[爬虫]爬取知网信息
代码:
"""
爬取知网数据
"""
import os
import time
import openpyxl
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
# ---- Browser setup, search submission, and workbook preparation ----
# 'detach' keeps the Chrome window open after the script ends so the
# results can be inspected manually.
options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)
browser = webdriver.Chrome(options=options)
browser.get('https://kns.cnki.net/kns8')
time.sleep(2)  # crude fixed wait for the page to render; a WebDriverWait would be more robust

# Type the query into the search box and press Enter to submit.
search_box = browser.find_element(By.ID, 'txt_search')
search_text = '常州新能源'
enter_key = '\n'
search_box.send_keys(search_text + enter_key)
time.sleep(2)

# Delete any previous export so each run starts from a fresh workbook.
if os.path.exists(f'知网_{search_text}.xlsx'):
    os.remove(f'知网_{search_text}.xlsx')

# Workbook that collects one row per scraped article.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = search_text
header = ['标题', '作者', '时间', '摘要', '关键字', '文章']
sheet.append(header)

# Count the pagination links to estimate how many result pages exist.
# (The original also fetched the 'fz14' article links here into an unused
# variable; crawl_pages re-fetches them per page, so that was removed.)
page_a_tag_list = browser.find_elements(By.CSS_SELECTOR, '.pages > a')
pages = len(page_a_tag_list)
def crawl_pages(pages):
    """Scrape one article per CNKI result page and append rows to ``sheet``.

    For each of the first ``pages`` result pages (hard-capped at 8), clicks
    the second element with class ``fz14``, switches to the detail tab,
    extracts title / author / date / abstract / keywords / abstract text,
    appends the row to the module-level ``sheet``, closes the tab, and clicks
    the next-page button.  Finally saves the workbook and closes the browser.

    NOTE(review): only index [1] of the 'fz14' elements is clicked, so at
    most one article per page is collected — confirm whether every article
    link on the page should be iterated instead.

    :param pages: number of result pages reported by the pagination bar.
    """
    for page_no in range(1, pages + 1):
        # Open the article detail view; CNKI opens it in a new tab.
        link = browser.find_elements(By.CLASS_NAME, 'fz14')[1]
        link.click()
        time.sleep(2)
        browser.switch_to.window(browser.window_handles[-1])

        soup = BeautifulSoup(browser.page_source, 'html.parser')
        title = soup.select_one('h1').text.strip()

        # Some articles lack an author / abstract / date node; select_one
        # then returns None and .text raises AttributeError — fall back to ''.
        # (Was a bare 'except:', which also hid unrelated errors.)
        try:
            author = soup.select_one('h3 > span > a').text.strip()
        except AttributeError:
            author = ''
        try:
            abstract = soup.select_one('div.brief > div:nth-child(2) > p').text.strip()
        except AttributeError:
            abstract = ''
        # Guard the abstract body too — the original crashed mid-run here
        # whenever '.abstract-text' was missing from the page.
        abstract_node = soup.select_one('.abstract-text')
        text = abstract_node.text if abstract_node is not None else ''
        try:
            date = soup.select_one('div.doc-top > div:nth-child(4) > p').text.strip()
        except AttributeError:
            date = ''

        # Concatenate keyword anchors; stop at the first empty one
        # (original behaviour — anchors after an empty entry are dropped).
        keywords = ''
        for anchor in soup.select('div.brief > div:nth-child(4) > p >a'):
            word = anchor.text.strip()
            if not word:
                break
            keywords += word

        sheet.append([title, author, date, abstract, keywords, text])

        # Close the detail tab and return to the results window.
        browser.close()
        browser.switch_to.window(browser.window_handles[0])

        # Hard cap: never crawl beyond page 8.
        if page_no == 8:
            break
        # Renamed from the original 'p', which shadowed the loop variable.
        next_btn = browser.find_element(By.ID, 'PageNext')
        next_btn.click()
        time.sleep(2)

    wb.save(f'知网_{search_text}.xlsx')
    browser.close()


crawl_pages(pages)