[爬虫]爬虫boss直聘

作者: oldboy 分类: Python 发布时间: 2024-01-02 21:38

要求:

输入关键词,爬取多页数据,不进入详情页面

代码:

#单   位:常州旺龙
#作   者:OLDNI
#开发日期:2023/10/27

'''
爬虫boss直聘
要求:输入关键词,爬取多页数据,不进入详情页面
'''
import os
import time

from django.test.utils import override_script_prefix
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.by import By

# --- Excel output --------------------------------------------------------
# One workbook, one active sheet; the first row holds the column headers
# (post, address, salary, experience, education, interviewer, requirements,
# company name, company profile, company benefits).
wb = Workbook()
sheet = wb.active
header = ['岗位', '地址', '薪资', '工作年限', '学历',
          '面试官信息', '岗位要求', '公司名称', '公司简介', '公司福利']
sheet.append(header)
# --- Browser setup -------------------------------------------------------
# Configure Chrome so the session looks less like an automated one.
options = webdriver.ChromeOptions()
# Keep the browser window open after the script exits.
options.add_experimental_option('detach', True)
# Suppress the Blink-side automation fingerprint.
options.add_argument('--disable-blink-features=AutomationControlled')
# Remove the "Chrome is being controlled by automated software" info bar.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

browser = webdriver.Chrome(options=options)
# Implicitly poll up to 10 s for elements to appear before raising.
browser.implicitly_wait(10)

# Open the Boss Zhipin landing page for Changzhou.
url = 'https://www.zhipin.com/changzhou/'
browser.get(url)

# --- Search --------------------------------------------------------------
# Type the keyword into the search box; the trailing newline submits it.
keyword = '爬虫\n'
search_box = browser.find_element(By.CSS_SELECTOR, '.ipt-search')
search_box.send_keys(keyword)

# The second-to-last pager link shows the total number of result pages.
pager_selector = ('div.search-job-result > div > div > div '
                  '> a:nth-last-child(2)')
pages = int(browser.find_element(By.CSS_SELECTOR, pager_selector).text)
# --- Scrape each result page --------------------------------------------
# Hard cap on how many pages are actually scraped (the site reports
# `pages` in total); previously a bare magic number in the loop body.
MAX_PAGES = 2

# CSS selector for each field of a job card, in the same order as the
# header row already written to the sheet.
FIELD_SELECTORS = [
    '.job-name',                                     # post
    '.job-area',                                     # address
    '.salary',                                       # salary
    '.job-info.clearfix .tag-list >li:first-child',  # work years
    '.job-info.clearfix .tag-list >li:last-child',   # education
    '.info-public',                                  # interviewer
    '.job-card-footer.clearfix >.tag-list',          # requirements
    '.company-name',                                 # company name
    '.company-tag-list',                             # company profile
    '.info-desc',                                    # company benefits
]


def _scrape_card(card):
    """Return the ten text fields of one job-card <li> element."""
    return [card.find_element(By.CSS_SELECTOR, sel).text
            for sel in FIELD_SELECTORS]


for page in range(1, pages + 1):
    # From page 2 on, results live under a different URL scheme
    # (query = URL-encoded '爬虫', city 101191100 = Changzhou).
    if page >= 2:
        browser.get(f'https://www.zhipin.com/web/geek/job?query=%E7%88%AC%E8%99%AB&city=101191100&page={page}')
    # Each page lists up to 30 postings, one <li> per job card.
    li_tag_list = browser.find_elements(
        By.CSS_SELECTOR,
        'div.job-list-wrapper > div.search-job-result > ul > li')
    print(f'正在爬取第{page}页数据')
    for li in li_tag_list:
        sheet.append(_scrape_card(li))
    print(f'已爬取第{page}页数据')
    # `page` is already an int — no redundant int() cast needed here.
    if page == MAX_PAGES:
        break
# --- Shutdown and save ---------------------------------------------------
# quit() (rather than close()) ends the whole WebDriver session and
# terminates the chromedriver process; close() only closes the current
# window and leaves the driver process running.
browser.quit()

# Strip the '\n' that was appended to submit the search, and build the
# output file name exactly once.
keyword = keyword.strip()
filename = f'爬取BOSS_{keyword}_岗位信息.xlsx'
wb.save(filename)
# NOTE(review): os.startfile is Windows-only; it will raise
# AttributeError on other platforms — acceptable for this script's
# apparent target environment, but worth confirming.
os.startfile(filename)

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注

标签云