Python 3: scraping 51job with urllib versus BeautifulSoup


1. Scraping 51job with urllib

import urllib
from urllib.request import urlopen, quote
from urllib import request
import random
import re
import chardet
import xlwt

count = 0
# Define a function used only to fetch the raw page content
def getHtml(url):
    # Browser User-Agent strings to rotate through
    user_agent = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5"
    ]
    # Proxy IPs to rotate through
    proxies = ['125.118.77.150:808']
    # proxies = request.ProxyHandler({'http': '125.118.77.150:808'})
    # 1. Build the request for the target URL
    req = request.Request(url)
    # Add a User-Agent header, picked at random from the pool
    req.add_header('User-Agent', random.choice(user_agent))
    # Route the request through a randomly chosen proxy IP
    proxy_support = request.ProxyHandler({"http": random.choice(proxies)})
    opener = request.build_opener(proxy_support)
    request.install_opener(opener)
    # 2. Get the response object back from the site
    try:
        res = urllib.request.urlopen(req)
        html = res.read()
        return html
    except:
        global count
        count += 1
        # print(count)
        if count > 3:
            print(count, "give it up, brother")
            return
        return getHtml(url)  # retry the same URL
# Define a function that returns the regex-parsed results for one page
def getDatelist(num, jobname):
    # 01. Build the 51job search URL for this keyword and page number
    url = "https://search.51job.com/list/000000,000000,0000,00,9,99," + str(quote(jobname)) + ",2," + str(num) + ".html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
    # url = "https://search.51job.com/list/000000,000000,0000,00,9,99,{},2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=4&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=".format(
    #     jobname, num)
    # Decode the page bytes (51job serves gbk)
    # html = getHtml(url)
    html = getHtml(url).decode('gbk')
    print(html)
    # Regex: capture job title, company, location, salary and post date
    reg = re.compile(
        r'<p class="t1 ">.*?<a target="_blank" title="(.*?)".*?<span class="t2"><a target="_blank" title="(.*?)".*?<span class="t3">(.*?)</span>.*?<span class="t4">(.*?)</span>.*?<span class="t5">(.*?)</span>',
        re.S)
    result = re.findall(reg, html)
    return result
# Global list holding all scraped rows
datalist = []

# Append each page's rows to the global datalist
def deal(num, jobname):
    global datalist
    # Loop over the configured number of pages, fetching data on each pass
    for k in range(num):
        data = getDatelist(k + 1, jobname)
        print('data', data)
        for i in range(len(data)):
            datalist.append(data[i])
# Define a function that saves the results to a local Excel file
def saveExcel(jobname, filename):
    # 01. Create a workbook
    wbk = xlwt.Workbook(encoding='utf-8')
    # 02. Create a worksheet
    sheet = wbk.add_sheet('51job' + str(jobname))
    # 03. Write the header row
    col = ('Job Title', 'Company', 'Location', 'Salary', 'Date Posted')
    for i in range(len(col)):
        sheet.write(0, i, col[i])
    # 04. Write the scraped rows
    for i in range(len(datalist)):
        print("writing row {}".format(i + 1))
        for j in range(len(datalist[i])):
            sheet.write(i + 1, j, datalist[i][j])
    # 05. Save the file
    wbk.save(u'data/' + filename)
    print('ok')
# Define a function that saves the results to a txt file
def saveTxt(filename):
    for i in range(len(datalist)):
        data = datalist[i]
        with open(u'data/' + filename, 'a') as f:
            # one tab-separated record per line; the with block closes the file
            f.write(data[0] + '\t' + data[1] + '\t' + data[2] + '\t' + data[3] + '\t' + data[4] + '\n')
# Main entry point: scrape num pages, then save in the format the filename implies
def mainJob(jobname, num, filename):
    deal(num, jobname)
    if 'txt' in filename:
        saveTxt(filename)
    if 'xls' in filename:
        saveExcel(jobname, filename)

mainJob('測試工程師', 1, u'51job測試.xls')
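
The decode('gbk') call above assumes 51job always serves gbk-encoded pages, and the chardet import goes unused. A minimal sketch of an encoding-aware variant (the helper decodeHtml is hypothetical, not part of the original script):

# Sketch: detect the charset from the raw bytes instead of assuming gbk.
def decodeHtml(raw):
    if raw is None:  # getHtml() returns None after repeated failures
        return ''
    code = chardet.detect(raw)['encoding'] or 'gbk'  # fall back to gbk
    return raw.decode(code, errors='replace')

# usage inside getDatelist: html = decodeHtml(getHtml(url))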

2. Scraping 51job with BeautifulSoup

from bs4 import BeautifulSoup
import chardet
import requests
import xlwt

url = 'https://search.51job.com/list/020000,000000,0000,00,9,99,%25E4%25BA%25BA%25E5%25B7%25A5%25E6%2599%25BA%25E8%2583%25BD,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='
# Fetch the page and let chardet detect its encoding
r = requests.get(url)
code = chardet.detect(r.content)['encoding']
r.encoding = code
soup = BeautifulSoup(r.text, 'html.parser')
# The result list lives in <div id="resultList">; each row is a div.el
total_list = soup.find('div', attrs={'id': 'resultList'})
total = total_list.find_all('div', attrs={'class': 'el'})
total.pop(0)  # drop the header row
dataList = []
for i in total:
    data = []
    # job title, company, location, salary, post date
    e1 = i.find('p').find('a')['title']
    data.append(e1)
    e2 = i.find('span', attrs={'class': 't2'}).find('a').string
    data.append(e2)
    e3 = i.find('span', attrs={'class': 't3'}).string
    data.append(e3)
    e4 = i.find('span', attrs={'class': 't4'}).string
    data.append(e4)
    e5 = i.find('span', attrs={'class': 't5'}).string
    data.append(e5)
    dataList.append(data)
# Write the rows to an Excel file
wbk = xlwt.Workbook()
sheet = wbk.add_sheet('python')
for i, each in enumerate(dataList):
    for j, value in enumerate(each):
        sheet.write(i, j, value)
wbk.save('data/python.xls')
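
Where the urllib version walks the DOM-free HTML with a regex, BeautifulSoup navigates the parsed tree, but this script only fetches page 1: the page number is the trailing 1 in the ",2,1.html" segment of the URL, and the search keyword is percent-encoded twice (%25E4… decodes to %E4…). A minimal pagination sketch under those assumptions, with the long query string trimmed for brevity (fetchPage and base are hypothetical, not part of the original script):

from urllib.parse import quote

# Sketch: paginate the BeautifulSoup version, assuming the ",2,<page>.html"
# segment selects the page and the keyword must be percent-encoded twice.
base = ('https://search.51job.com/list/020000,000000,0000,00,9,99,'
        '{kw},2,{page}.html?lang=c&stype=&postchannel=0000')

def fetchPage(keyword, page):
    u = base.format(kw=quote(quote(keyword)), page=page)  # double-encode the keyword
    r = requests.get(u)
    r.encoding = chardet.detect(r.content)['encoding']
    return BeautifulSoup(r.text, 'html.parser')

# for page in range(1, 4):
#     soup = fetchPage('人工智能', page)  # then parse div#resultList as above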
