[爬虫]爬取知网信息

作者: oldboy 分类: Python 发布时间: 2023-10-26 15:03

代码:


"""
爬取知网数据
"""
import os
import time

import openpyxl
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

# Keep the Chrome window open after the script exits so results can be inspected.
options = webdriver.ChromeOptions()
options.add_experimental_option('detach', True)

browser = webdriver.Chrome(options=options)
browser.get('https://kns.cnki.net/kns8')
time.sleep(2)  # crude fixed wait for page load; WebDriverWait would be more robust

# Submit the search query by typing it followed by Enter into the search box.
search_box = browser.find_element(By.ID, 'txt_search')
search_text = '常州新能源'
search_box.send_keys(search_text + '\n')
time.sleep(2)

# Remove any stale result workbook left over from a previous run.
if os.path.exists(f'知网_{search_text}.xlsx'):
    os.remove(f'知网_{search_text}.xlsx')

# Prepare the output workbook: one sheet named after the query, with a header row.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = search_text
header = ['标题', '作者', '时间', '摘要', '关键字', '文章']
sheet.append(header)

# The number of pagination links determines how many result pages to crawl.
page_a_tag_list = browser.find_elements(By.CSS_SELECTOR, '.pages > a')
pages = len(page_a_tag_list)

def crawl_pages(pages):
    """Crawl up to *pages* CNKI result pages, saving one article per page.

    For each result page: click the second result link (index 1), switch to
    the newly opened detail tab, scrape title / author / date / abstract /
    keywords / body text, append a row to the shared worksheet, close the
    tab, return to the result list, and click "next page".  The workbook is
    saved once after the loop.

    NOTE(review): only result link [1] is scraped on every page — presumably
    this was meant to loop over all links on the page; confirm intent.

    :param pages: number of result pages to visit (hard-capped at 8 below).
    """
    for page in range(1, pages + 1):
        # Open the article's detail view, which loads in a new browser tab.
        link = browser.find_elements(By.CLASS_NAME, 'fz14')[1]
        link.click()
        time.sleep(2)
        browser.switch_to.window(browser.window_handles[-1])

        soup = BeautifulSoup(browser.page_source, 'html.parser')
        title = soup.select_one('h1').text.strip()

        # Optional fields: a missing element makes select_one() return None,
        # so accessing .text raises AttributeError — catch exactly that
        # instead of a bare `except:` that would hide real bugs.
        try:
            author = soup.select_one('h3 > span > a').text.strip()
        except AttributeError:
            author = ''
        try:
            abstract = soup.select_one('div.brief > div:nth-child(2) > p').text.strip()
        except AttributeError:
            abstract = ''
        try:
            # Originally unguarded: a page without an abstract crashed the crawl.
            text = soup.select_one('.abstract-text').text
        except AttributeError:
            text = ''
        try:
            date = soup.select_one('div.doc-top > div:nth-child(4) > p').text.strip()
        except AttributeError:
            date = ''

        # Concatenate keywords, stopping at the first empty one
        # (preserves the original early-break behaviour).
        keywords = ''
        for keyword_tag in soup.select('div.brief > div:nth-child(4) > p >a'):
            word = keyword_tag.text.strip()
            if not word:
                break
            keywords += word

        sheet.append([title, author, date, abstract, keywords, text])

        # Close the detail tab and return to the result-list tab.
        browser.close()
        browser.switch_to.window(browser.window_handles[0])

        # Hard cap at 8 pages (site limit? TODO confirm).
        if page == 8:
            break
        # Click "next page" — originally this reassigned the loop variable `p`.
        browser.find_element(By.ID, 'PageNext').click()
        time.sleep(2)

    wb.save(f'知网_{search_text}.xlsx')
    browser.close()

crawl_pages(pages)

如果觉得我的文章对您有用,请随意打赏。您的支持将鼓励我继续创作!

发表回复

您的电子邮箱地址不会被公开。 必填项已用*标注

标签云