[爬虫]爬取百度热搜(excel)
要求
爬取的数据写入excel表格
代码
#单 位:常州旺龙
#作 者:OLDNI
#开发日期:2023/11/16
'''
爬取百度热搜:https://top.baidu.com/board?tab=realtime
生成excel表格
'''
import os
from openpyxl import Workbook
from openpyxl.styles import Font, PatternFill, Alignment
from selenium import webdriver
from selenium.webdriver.common.by import By
# Build the Chrome options, start the browser and load the Baidu
# real-time hot-search board.
options = webdriver.ChromeOptions()

# Experimental options:
#   detach                 - per the original author, works around the
#                            browser window closing immediately
#   excludeSwitches        - hides the "controlled by automated test
#                            software" banner
#   useAutomationExtension - turned off alongside the switch above
for option_name, option_value in (
    ('detach', True),
    ("excludeSwitches", ["enable-automation"]),
    ('useAutomationExtension', False),
):
    options.add_experimental_option(option_name, option_value)

# Command-line switches: run headless, and make automation harder for the
# site to detect.
for switch in (
    '--headless',
    '--disable-blink-features=AutomationControlled',
):
    options.add_argument(switch)

# Create the browser object and open the target page.
browser = webdriver.Chrome(options=options)
browser.get('https://top.baidu.com/board?tab=realtime')
# Header row: rank, title, summary.
header = [
    '排名',
    '标题',
    '简介',
]

# Create the Excel workbook and write the header as the first row of its
# active sheet.
wb = Workbook()
sheet = wb.active
sheet.append(header)
# Collect every hot-search entry on the page and append one row
# (rank, title, summary) per entry to the sheet.
#
# try/finally guarantees the browser is shut down even when a selector
# fails part-way through. quit() ends the whole WebDriver session, whereas
# close() only closes the current window.
try:
    # One element per hot-search entry.
    div_tag_list = browser.find_elements(
        By.CSS_SELECTOR, '.category-wrap_iQLoo.horizontal_1eKyQ'
    )
    for position, div_tag in enumerate(div_tag_list):
        # The first entry is labelled as pinned ('置顶'); the following ones
        # are numbered from 1. Deriving the rank from the position avoids the
        # IndexError a fixed 51-item rank list hits on a longer page.
        ranking = '置顶' if position == 0 else position
        title = div_tag.find_element(
            By.CSS_SELECTOR, '.content_1YWBm .c-single-text-ellipsis'
        ).text
        profile_text = div_tag.find_element(
            By.CSS_SELECTOR, '.content_1YWBm >div:nth-child(2)'
        ).text
        # Drop the trailing "查看更多>" (view more) link text from the summary.
        profile = profile_text.split('查看更多>')[0].strip()
        sheet.append([ranking, title, profile])
finally:
    browser.quit()
# Apply presentation formatting to the sheet: column widths, header-row
# styling and a frozen header row.

# Column widths, keyed by column letter.
column_widths = {'A': 6, 'B': 36, 'C': 128}
for column_letter, width in column_widths.items():
    sheet.column_dimensions[column_letter].width = width

# Header style: bold text on a solid yellow fill, horizontally centred.
header_font = Font(bold=True)
header_fill = PatternFill(fill_type='solid', fgColor='FFFF00')
header_alignment = Alignment(horizontal='center')
for cell in sheet[1]:
    cell.font = header_font
    cell.fill = header_fill
    cell.alignment = header_alignment

# Freeze the panes at A2 so the header row stays visible while scrolling.
sheet.freeze_panes = 'A2'
# Save the workbook, then open it with the system's default application.
output_path = '百度热搜.xlsx'
wb.save(output_path)
# os.startfile is only available on Windows; on other platforms report the
# saved location instead of failing with AttributeError.
if hasattr(os, 'startfile'):
    os.startfile(output_path)
else:
    print(f'Saved {os.path.abspath(output_path)}')