[爬虫]爬虫boss直聘
要求:
输入关键词,爬取多页数据,不进入详情页面
代码:
# Author: OLDNI
# Date: 2023/10/27
'''
BOSS Zhipin job scraper.

Searches for a keyword, walks the paginated search-result list
(WITHOUT opening any job-detail pages), and writes one row per job
card into an Excel workbook, which is then opened.

Requires Chrome + chromedriver; `os.startfile` is Windows-only.
'''
import os
from urllib.parse import quote

from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.by import By

# Search keyword (the requirement says "input a keyword" — change it here).
KEYWORD = '爬虫'
# City code for Changzhou, used by the paginated search URL.
CITY_CODE = '101191100'
# Hard cap on how many result pages to scrape.
MAX_PAGES = 2
# Column headers for the output workbook (job fields, in scrape order).
HEADER = ['岗位', '地址', '薪资', '工作年限', '学历',
          '面试官信息', '岗位要求', '公司名称', '公司简介', '公司福利']


def _make_browser():
    """Create a Chrome driver configured to look less like automation."""
    options = webdriver.ChromeOptions()
    # Keep the browser window open after the script finishes.
    options.add_experimental_option('detach', True)
    options.add_argument('--disable-blink-features=AutomationControlled')
    # Hide the "being controlled by automated software" banner.
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    browser = webdriver.Chrome(options=options)
    # Implicit wait so element lookups tolerate slow page loads.
    browser.implicitly_wait(10)
    return browser


def _scrape_page(browser):
    """Return a list of rows, one per job card (<li>) on the current page."""
    rows = []
    # Each result page holds the job cards as <li> tags under the result list.
    li_tag_list = browser.find_elements(
        By.CSS_SELECTOR,
        'div.job-list-wrapper > div.search-job-result > ul > li')
    for li in li_tag_list:
        post = li.find_element(By.CSS_SELECTOR, '.job-name').text
        address = li.find_element(By.CSS_SELECTOR, '.job-area').text
        salary = li.find_element(By.CSS_SELECTOR, '.salary').text
        # First/last tag in the tag list are experience and education.
        work_years = li.find_element(
            By.CSS_SELECTOR, '.job-info.clearfix .tag-list >li:first-child').text
        education = li.find_element(
            By.CSS_SELECTOR, '.job-info.clearfix .tag-list >li:last-child').text
        interviewer = li.find_element(By.CSS_SELECTOR, '.info-public').text
        requirement = li.find_element(
            By.CSS_SELECTOR, '.job-card-footer.clearfix >.tag-list').text
        company_name = li.find_element(By.CSS_SELECTOR, '.company-name').text
        company_profile = li.find_element(
            By.CSS_SELECTOR, '.company-tag-list').text
        company_benifits = li.find_element(By.CSS_SELECTOR, '.info-desc').text
        rows.append([post, address, salary, work_years, education, interviewer,
                     requirement, company_name, company_profile,
                     company_benifits])
    return rows


def main():
    """Drive the scrape: search, paginate, collect rows, save and open xlsx."""
    wb = Workbook()
    sheet = wb.active
    sheet.append(HEADER)

    browser = _make_browser()
    browser.get('https://www.zhipin.com/changzhou/')
    # Type the keyword into the search box and press Enter to search.
    browser.find_element(By.CSS_SELECTOR, '.ipt-search').send_keys(
        KEYWORD + '\n')
    # Total page count comes from the second-to-last pager link
    # (the last one is the "next page" arrow).
    pages = int(browser.find_element(
        By.CSS_SELECTOR,
        'div.search-job-result > div > div > div > a:nth-last-child(2)').text)

    for page in range(1, pages + 1):
        # From page 2 onward, navigate via the paginated search URL directly.
        # The keyword is URL-encoded instead of hard-coded, so changing
        # KEYWORD keeps pagination consistent with the initial search.
        if page >= 2:
            browser.get(
                'https://www.zhipin.com/web/geek/job'
                f'?query={quote(KEYWORD)}&city={CITY_CODE}&page={page}')
        print(f'正在爬取第{page}页数据')
        for row in _scrape_page(browser):
            sheet.append(row)
        print(f'已爬取第{page}页数据')
        # Stop early once the configured page cap is reached.
        if page == MAX_PAGES:
            break

    # close() (not quit()) so the detached window stays open for inspection.
    browser.close()
    filename = f'爬取BOSS_{KEYWORD}_岗位信息.xlsx'
    wb.save(filename)
    os.startfile(filename)  # Windows-only: open the saved workbook.


if __name__ == '__main__':
    main()