[爬虫]爬取携程指定酒店7天的数据
要求:
1、指定酒店
代码:
#单 位:常州旺龙
#作 者:OLDNI
#开发日期:2023/10/14
"""
爬取携程指定酒店7天的数据
"""
import json
import os
import re
from datetime import datetime, timedelta
from urllib.parse import parse_qs, urlsplit

from bs4 import BeautifulSoup
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.by import By
# 定义要爬取的酒店列表
# Hotels to crawl: display name -> any Ctrip detail-page URL of that hotel.
# Only the hotelId and cityId query parameters are read from the URL; the
# check-in/check-out dates in it are replaced by the dates entered at runtime.
hotels_init_dict = {
    '宜兴氿洲开元名都大酒店': 'https://hotels.ctrip.com/hotels/detail/?hotelId=6337412&checkIn=2023-10-19&checkOut=2023-10-20&cityId=537',
}

DATE_FORMAT = '%Y-%m-%d'
# Number of consecutive one-night stays queried for every hotel.
NIGHTS = 7
DETAIL_URL_TEMPLATE = (
    'https://hotels.ctrip.com/hotels/detail/'
    '?hotelId={hotelid}&checkIn={check_in}&checkOut={check_out}&cityId={cityid}'
)
# Listing page that is loaded before and after the cookies are installed.
LIST_PAGE_URL = 'https://hotels.ctrip.com/hotels/list?countryId=1&city=537&checkin=2023/10/19&checkout=2023/10/20&optionId=537&optionType=City&directSearch=0&display=%E5%AE%9C%E5%85%B4%2C%20%E6%B1%9F%E8%8B%8F%2C%20%E4%B8%AD%E5%9B%BD&crn=1&adult=1&children=0&searchBoxArg=t&travelPurpose=0&ctm_ref=ix_sb_dl&domestic=1&'
COOKIES_FILE = 'xc_cookies.json'
OUTPUT_FILE = '100家酒店7天数据.xlsx'
SUMMARY_HEADER = ['酒店名称', '起始日期', '结束日期'] + [
    f'第{day}天的URL' for day in range(1, NIGHTS + 1)
]
DETAIL_HEADER = ['日期', '酒店名称', '房型', '餐饮', '代理商', '规则', '价格', '特色', '支付类型']
# Characters that are not allowed in an Excel worksheet title.
_INVALID_TITLE_CHARS = re.compile(r'[\\/*?:\[\]]')


def build_date_list(check_in, nights=NIGHTS):
    """Return ``nights + 1`` consecutive dates starting at *check_in*.

    Every date is formatted as YYYY-MM-DD, so sloppy input such as
    ``' 2023-1-5 '`` is normalised before it ends up in a URL.

    Raises:
        ValueError: if *check_in* is not a valid YYYY-MM-DD date.
    """
    first_day = datetime.strptime(check_in.strip(), DATE_FORMAT)
    return [
        (first_day + timedelta(days=offset)).strftime(DATE_FORMAT)
        for offset in range(nights + 1)
    ]


def extract_hotel_ids(hotel_url):
    """Return ``(hotelId, cityId)`` read from the query string of *hotel_url*.

    The parameters are looked up by name, so their position in the URL and
    the presence of extra parameters do not matter.

    Raises:
        ValueError: if either parameter is missing from the URL.
    """
    query = parse_qs(urlsplit(hotel_url).query)
    try:
        return query['hotelId'][0], query['cityId'][0]
    except KeyError as err:
        raise ValueError(
            f'URL is missing the {err.args[0]} query parameter: {hotel_url}'
        ) from err


def build_detail_urls(hotelid, cityid, date_list):
    """Return one detail-page URL per one-night stay between adjacent dates."""
    return [
        DETAIL_URL_TEMPLATE.format(
            hotelid=hotelid, check_in=check_in, check_out=check_out, cityid=cityid
        )
        for check_in, check_out in zip(date_list, date_list[1:])
    ]


def safe_sheet_title(name):
    """Return *name* made usable as a worksheet title.

    Forbidden characters are replaced by ``_`` and the result is cut to the
    31 characters Excel accepts for a sheet name.
    """
    return _INVALID_TITLE_CHARS.sub('_', name)[:31]


def hyperlink_formula(sheet_title, label):
    """Return a HYPERLINK formula that jumps to cell A1 of *sheet_title*."""
    # Quotes are doubled, which is how Excel escapes them inside a quoted
    # sheet reference and inside a string literal respectively.
    target = sheet_title.replace("'", "''")
    text = label.replace('"', '""')
    return f'=HYPERLINK("#\'{target}\'!A1","{text}")'


def node_text(parent, selector, default=''):
    """Return the text of the first node matching *selector*, else *default*."""
    node = parent.select_one(selector)
    return node.text if node is not None else default


def parse_room_rows(soup, date_label, fallback_hotel_name):
    """Convert a parsed detail page into worksheet rows.

    One row is produced per payment option of every room type. Room name and
    hotel name are written only on the first option of a room, and the date
    label only on the very first row of the page; the other rows leave these
    columns empty. A missing field yields an empty cell instead of aborting
    the crawl; a missing price is reported as sold out.
    """
    page_hotel_name = node_text(soup, 'h1', fallback_hotel_name)
    rows = []
    for room_div in soup.select('.roomlist-baseroom>div'):
        room_name = node_text(room_div, '.roomname')
        feature = node_text(room_div, '.roompanel-facility-desc')
        for offer_index, pay_div in enumerate(room_div.select('.salecardlist-rooms>div')):
            food = node_text(pay_div, '.bm-item')
            vendor = node_text(pay_div, '.vendor-desc')
            policy = node_text(pay_div, '.salecard-policy')
            price = node_text(pay_div, '.price-display.price-display-hover', '已售完')
            pay_type = node_text(pay_div, '.paytype')
            if offer_index == 0:
                lead = [date_label, page_hotel_name, room_name]
                date_label = ''
            else:
                lead = ['', '', '']
            rows.append(lead + [food, vendor, policy, price, feature, pay_type])
    return rows


def create_browser():
    """Start Chrome with the automation banner disabled and a 10 s implicit wait."""
    options = webdriver.ChromeOptions()
    options.add_experimental_option('detach', True)
    options.add_argument('--disable-blink-features=AutomationControlled')
    browser = webdriver.Chrome(options=options)
    browser.implicitly_wait(10)
    return browser


def login_with_cookies(browser, cookies_path=COOKIES_FILE):
    """Install the saved Ctrip cookies into *browser*.

    The listing page is opened first because Selenium only accepts cookies
    for the domain that is currently loaded, and reloaded afterwards so the
    session picks the cookies up.
    """
    browser.get(LIST_PAGE_URL)
    with open(cookies_path, encoding='utf-8') as f:
        cookies = json.load(f)
    for cookie in cookies:
        browser.add_cookie(cookie)
    browser.get(LIST_PAGE_URL)


def fetch_detail_html(browser, url):
    """Open *url* in a new tab, expand collapsed room lists, return the HTML.

    The tab is always closed and focus handed back to the first window, even
    when scraping the page fails.
    """
    # The URL is passed as an argument instead of being spliced into the script.
    browser.execute_script('window.open(arguments[0])', url)
    browser.switch_to.window(browser.window_handles[-1])
    print(f'已打开详情页:{url}')
    try:
        # Some room types only show their offers after "show more" is clicked.
        expand_list = browser.find_elements(By.CSS_SELECTOR, '.saleroomlist-showMore > div')
        print(f'当前页面展开的数量是:{len(expand_list)}')
        for expand in expand_list:
            try:
                expand.click()
            except Exception as e:  # best effort: an unclickable toggle is only reported
                print(e)
        return browser.page_source
    finally:
        browser.close()
        print(f'已关闭详情页:{url}')
        browser.switch_to.window(browser.window_handles[0])


def crawl_hotel(browser, wb, summary_sheet, hotel_name, hotel_url_init, date_list):
    """Crawl every night of one hotel into its own sheet and link it from the summary."""
    hotelid, cityid = extract_hotel_ids(hotel_url_init)
    detail_urls = build_detail_urls(hotelid, cityid, date_list)
    print(f'正在爬取{len(detail_urls)}条URL')

    # Use the title openpyxl actually assigned: it may differ from the
    # requested one when a sheet with that name already exists.
    detail_sheet = wb.create_sheet(safe_sheet_title(hotel_name))
    link = hyperlink_formula(detail_sheet.title, hotel_name)
    summary_sheet.append([link, date_list[0], date_list[-1], *detail_urls])
    detail_sheet.append(DETAIL_HEADER)

    for day_index, url in enumerate(detail_urls):
        html = fetch_detail_html(browser, url)
        soup = BeautifulSoup(html, 'html.parser')
        date_label = f'{date_list[day_index]}-{date_list[day_index + 1]}'
        for row in parse_room_rows(soup, date_label, hotel_name):
            detail_sheet.append(row)


def prompt_date_list():
    """Ask for the first check-in date until a valid one is entered."""
    while True:
        checkIn = input('请输入入住日期(例如:2023-10-19(默认间隔一天,2023-10-20退房):')
        try:
            return build_date_list(checkIn)
        except ValueError:
            print('日期格式不正确,请按 2023-10-19 的格式重新输入')


def open_output(path):
    """Open the workbook with the default application where that is supported."""
    if hasattr(os, 'startfile'):  # os.startfile only exists on Windows
        os.startfile(path)
    else:
        print(f'数据已保存到:{os.path.abspath(path)}')


def main():
    """Crawl all configured hotels and write the results to an Excel workbook."""
    date_list = prompt_date_list()
    print(f'正在爬取{len(hotels_init_dict)}家酒店')

    wb = Workbook()
    # The summary sheet keeps its own name so that per-hotel sheets never
    # receive the summary rows.
    summary_sheet = wb.active
    summary_sheet.append(SUMMARY_HEADER)

    browser = create_browser()
    try:
        login_with_cookies(browser)
        for hotel_name, hotel_url_init in hotels_init_dict.items():
            crawl_hotel(browser, wb, summary_sheet, hotel_name, hotel_url_init, date_list)
    finally:
        # Remove this call to keep the browser open after the crawl.
        browser.quit()

    wb.save(OUTPUT_FILE)
    open_output(OUTPUT_FILE)


if __name__ == '__main__':
    main()
效果: