[爬虫]爬取携程指定酒店的评价汇总

作者: oldboy 分类: Python 发布时间: 2023-10-25 17:43

要求:

1、爬取多家指定酒店的评价汇总

代码:

#单   位:常州旺龙
#作   者:OLDNI
#开发日期:2023/10/24

import json
import os
import re
import time
from datetime import datetime, timedelta

from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.by import By

# Stay window: check in today, check out one day later. Both dates are
# rendered as YYYY-MM-DD strings.
date_interval = timedelta(days=1)
today_obj = datetime.now()
tomorrow_obj = today_obj + date_interval
today_date, tomorrow_date = (
    day.strftime('%Y-%m-%d') for day in (today_obj, tomorrow_obj)
)

# Add hotels here: display name -> (hotelId, cityId) as they appear in the
# query string of the hotel's detail-page URL.
_HOTEL_IDS = {
    '淄博张店希尔顿欢朋酒店': (1379642, 542),
    '宜兴大酒店': (666485, 537),
    '淄博海悦瑞景酒店': (95265902, 542),
    '淄博莫兰迪酒店(八大局店)': (100542386, 542),
    '凤凰酒店雅居(平顶山市开源路步行街店)': (108599586, 3222),
    '全季酒店(平顶山万达广场店)': (105635773, 3222),
    '平顶山翔羽智慧酒店(文化宫店)': (96521686, 3222),
    '武汉光谷科技会展中心希尔顿欢朋酒店': (110707379, 477),
    '武汉光谷漫心酒店': (76505443, 477),
    '武汉恺德光谷城际酒店': (100465216, 477),
    '桔子酒店(武汉光谷大学园路店)': (105595985, 477),
}

# Hotel name -> detail-page URL. Every URL follows the same template and
# differs only in hotelId and cityId; the stay dates come from
# today_date / tomorrow_date.
hotels_dict = {
    name: (
        'https://hotels.ctrip.com/hotels/detail/'
        f'?hotelId={hotel_id}&checkIn={today_date}'
        f'&checkOut={tomorrow_date}&cityId={city_id}'
    )
    for name, (hotel_id, city_id) in _HOTEL_IDS.items()
}
print(f'正在爬取{len(hotels_dict)}家酒店')

# Create the Chrome browser session used for all page loads below.
options = webdriver.ChromeOptions()
# Switch off Blink's "AutomationControlled" feature
# (presumably so the session looks less like an automated browser).
options.add_argument('--disable-blink-features=AutomationControlled')
# 'detach' is the ChromeDriver option for leaving the browser window open
# once the driver process ends.
options.add_experimental_option('detach', True)
browser = webdriver.Chrome(options=options)
# Element lookups made through the driver wait up to 10 seconds.
browser.implicitly_wait(10)

# Log in to Ctrip with previously saved cookies so that the hotel detail
# pages can be opened afterwards. The list page is loaded once, the cookies
# are added to the session, and then the same page is loaded again.
list_page_url = 'https://hotels.ctrip.com/hotels/list?countryId=1&city=537&checkin=2023/10/19&checkout=2023/10/20&optionId=537'
browser.get(list_page_url)

# Read the saved cookies; the JSON file holds the entries that are handed
# to add_cookie one by one.
with open('15312585581_xc_cookies.json', encoding='utf-8') as cookie_file:
    cookies = json.load(cookie_file)
for cookie in cookies:
    browser.add_cookie(cookie)

browser.get(list_page_url)

# Row counter for linking hotel names to per-hotel sheets: A2 holds the
# first hotel, A3 the second, and so on. The active code does not read it.
n = 2

# Create the Excel workbook; the summary rows go into its active sheet.
wb = Workbook()
sheet = wb.active

# Column titles for the summary sheet, written as the first row.
header = [
    '酒店名称',
    '钻级分',
    '点评分',
    '点评条数',
    '简介',
    'URL',
    '点评汇总',
]
sheet.append(header)

def _select_text(soup, selector):
    """Return the text of the first node matching ``selector``, or ''.

    A missing node is reported on stdout and yields an empty string instead
    of raising, so a single page with an unexpected layout does not abort
    the run before the workbook has been saved.
    """
    node = soup.select_one(selector)
    if node is None:
        print(f'警告: 页面中未找到 {selector}')
        return ''
    return node.text


# Visit every hotel's detail page in its own tab, scrape the headline data
# and the review filter tags, and append one row per hotel to the summary
# sheet.
for hotel_name, hotel_url in hotels_dict.items():
    print(hotel_name, hotel_url)

    # Open the detail page in a new tab. The URL is passed as a script
    # argument rather than being spliced into the JavaScript source.
    browser.execute_script('window.open(arguments[0])', hotel_url)
    time.sleep(3)
    browser.switch_to.window(browser.window_handles[-1])

    soup = BeautifulSoup(browser.page_source, 'html.parser')

    # The level is the 4th '_'-separated piece of the headline image's src.
    level_img = soup.select_one('div.detail-headline_title > img')
    if level_img is None:
        print('警告: 页面中未找到 div.detail-headline_title > img')
        level_parts = []
    else:
        level_parts = level_img.get('src', '').split('_')
    level = level_parts[3] if len(level_parts) > 3 else ''

    score = _select_text(soup, 'span.detail-headreview_score_box > b')

    # Review counts above 1000 are written as "1,000", so collect every run
    # of digits and join them back into one number string.
    score_number = ''.join(
        re.findall(r'\d+', _select_text(soup, 'p.detail-headreview_all'))
    )
    profile = _select_text(soup, '.basic-sub.clearfix')
    data = [hotel_name, level, score, score_number, profile, hotel_url]

    # NOTE: per-hotel sheets linked from column A via HYPERLINK were disabled
    # by the author; every hotel is written as one row of the summary sheet.

    # The review filter buttons carry the review summary tags; their texts
    # are appended after the fixed columns.
    comment_tags_list = soup.select('.u-btn.u-btn-filter.u-btn-sm.u-btn-radiuslg')
    print(f'共有{len(comment_tags_list)}个标签')
    data_comment = [tag.text for tag in comment_tags_list]
    sheet.append(data + data_comment)

    # Close the tab that was just opened and return to the first tab.
    browser.close()
    browser.switch_to.window(browser.window_handles[0])
    time.sleep(2)

# Closing the browser (its last remaining tab) is left disabled:
# browser.close()

# Save the workbook, then open it with the associated application.
output_path = '获取指定酒店的点评.xlsx'
wb.save(output_path)
if hasattr(os, 'startfile'):
    os.startfile(output_path)
else:
    # os.startfile exists only on Windows; elsewhere just report where the
    # file was written instead of failing with AttributeError after saving.
    print(f'已保存: {os.path.abspath(output_path)}')

效果:

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注

标签云