
Crawling Novels with Python and Storing Them in Alibaba Cloud OSS + MySQL


I love reading web novels, but the reading sites are so full of ads that they ruin the experience, so I decided to write a crawler. Since I may later build my own app on top of the data, it is best kept online: the chapter text goes into Alibaba Cloud OSS, while the novel and chapter metadata go into a MySQL database, which can be either a cloud database or a local one.

(Screenshot: the OSS "directory" listing of the crawled novels)
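For orientation, this is the layout the crawler ends up writing (summarised from the writer functions in step 4; the exact paths are built there):

# OSS bucket (or the local ./files and ./images folders when TO_OSS is False):
#   images/{articleid}.{ext}                      - one cover image per novel
#   {articleid // 1000}/{articleid}/{order}.txt   - one plain-text file per chapter
#
# MySQL: the article, chapter and errorlog tables hold the novel metadata,
#   the chapter index and a log of failed chapter downloads.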

1. Create the configuration file setting.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymysql

HEADERS = {'Content-type': 'text/html',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}

# Root URL of the site
BASE_URL = 'https://www.88dushu.com'

# Page to start crawling from
START_URL = 'https://www.88dushu.com/top/fullflag/'

# XPath of the novel links on a list page
ARTICLE_LINKS_XPATH = "//div[@class='booklist']//span[@class='sm']//a"

# XPath of the "next page" link on a list page
ARTICLE_NEXTPAGE_XPATH = "//a[@class='next']"

# XPath of the chapter links on a novel page
CHAPTER_LINKS_XPATH = "//div[@class='mulu']//a"

# True: store chapter text in OSS; False: store it on the local disk
TO_OSS = True

# OSS connection settings; see your Alibaba Cloud OSS console for these values
ACCESSKEYID = 'your accesskeyid'
ACCESSKEYSECRET = 'your access key secret'
ENDPOINT = 'oss-cn-hongkong-internal.aliyuncs.com'
BUCKETNAME = 'your bucket'

DB_CONFIG = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root_user',
    'password': 'rootpassword',
    'db': 'yourdb',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor
}
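The crawler in step 4 reads and writes three tables: article, chapter and errorlog, whose definitions do not appear in this post. Below is a minimal schema sketch inferred from the INSERT/SELECT/UPDATE statements used later; the column names come from that SQL, but the types and lengths are assumptions, so adjust them to your needs:

# schema.py - run once to create the tables the crawler writes to (a sketch, not from the original post)
import pymysql
from setting import DB_CONFIG

DDL = [
    """CREATE TABLE IF NOT EXISTS article (
        articleid     INT AUTO_INCREMENT PRIMARY KEY,
        articlename   VARCHAR(255),
        author        VARCHAR(100),
        intro         TEXT,
        keywords      VARCHAR(255),
        largesort     VARCHAR(50),
        status        VARCHAR(20),
        postdate      DOUBLE,          -- stored as a Unix timestamp via time.time()
        chapters      INT DEFAULT 0,
        size          INT DEFAULT 0,
        lastchapterid INT,
        lastchapter   VARCHAR(255)
    ) DEFAULT CHARSET=utf8mb4""",
    """CREATE TABLE IF NOT EXISTS chapter (
        chapterid    INT AUTO_INCREMENT PRIMARY KEY,
        chaptername  VARCHAR(255),
        chapterorder INT,
        size         INT,
        articleid    INT,
        articlename  VARCHAR(255),
        volumename   VARCHAR(100)
    ) DEFAULT CHARSET=utf8mb4""",
    """CREATE TABLE IF NOT EXISTS errorlog (
        id           INT AUTO_INCREMENT PRIMARY KEY,
        articleid    INT,
        articlename  VARCHAR(255),
        chapterorder INT,
        chapterlink  VARCHAR(500),
        errortime    DATETIME
    ) DEFAULT CHARSET=utf8mb4""",
]

conn = pymysql.connect(**DB_CONFIG)
with conn.cursor() as cur:
    for stmt in DDL:
        cur.execute(stmt)
conn.commit()
conn.close()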

2. Define the fields to crawl in items.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

class Article():
    # define the fields for your item here, for example:
    articleid = str()
    name = str()
    keywords = str()
    author = str()
    largesort = str()
    status = str()
    lastchapter = str()
    lastchapterid = str()
    chapters = str()            # current chapter count on the site (live value)
    database_chapters = str()   # chapter count already stored in the database (temporary)
    intro = str()
    postdate = str()
    lastupdate = str()
    size = str()
    image_url = str()
    in_database = False         # whether this novel already exists in the database
    new_database = False        # whether new chapters have appeared since the last crawl

class Chapter():
    articleid = str()
    article_in_database = str()
    articlename = str()
    chapterid = str()
    chaptername = str()
    chapterorder = str()
    size = str()
    chapterContent = str()
    lastorder = str()
    volumename = str()
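These are plain classes used as mutable record holders; the parsers in step 4 simply assign to their attributes on each instance. If you prefer something more explicit, the same idea can be written with dataclasses. This is an optional sketch with assumed field types, not something the crawler below requires:

from dataclasses import dataclass, field

@dataclass
class ChapterItem:
    # equivalent record for a chapter, with explicit defaults
    articleid: int = 0
    chaptername: str = ''
    chapterorder: int = 0
    size: int = 0
    chapterContent: list = field(default_factory=list)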

3. Create the base class Spider in spider.py, which downloads pages and extracts links

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from setting import *
import requests
from urllib.parse import urljoin
from lxml import etree
from time import sleep

class Spider():
    def download_page(self, url, headers=None, retries=3):
        '''Download a page and return its decoded HTML, retrying a few times on failure.'''
        try:
            response = requests.get(url, headers=headers, verify=False)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            html = response.text
        except requests.RequestException:
            sleep(10)
            html = None
            if retries > 0:
                return self.download_page(url, headers, retries - 1)
        return html

    def download_links(self, html, base_url, links_xpath):
        '''Extract the links matching links_xpath from html and resolve them against base_url.'''
        page = etree.HTML(html)
        elements = page.xpath(links_xpath)
        links = [urljoin(base_url, element.attrib['href']) for element in elements]
        return None if len(links) == 0 else links
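A quick way to sanity-check the base class before writing the full crawler; a sketch you could append to spider.py, using only names that already exist in setting.py:

if __name__ == '__main__':
    s = Spider()
    html = s.download_page(START_URL, headers=HEADERS)
    if html:
        links = s.download_links(html, BASE_URL, ARTICLE_LINKS_XPATH) or []
        print(len(links), 'novel links found on the first list page')
    else:
        print('failed to download', START_URL)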

4. Implement the novel crawler:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from spider import Spider
from setting import *
from items import Article, Chapter
from lxml import etree
import pymysql
import time
import os
import requests
import oss2
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the warnings requests emits because verify=False is used
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

class dushuspider(Spider):
    conn = pymysql.connect(**DB_CONFIG)
    basefilepath = os.getcwd() + '/files'
    baseimagepath = os.getcwd() + '/images'
    chapter = None
    article = None
    auth = oss2.Auth(ACCESSKEYID, ACCESSKEYSECRET)
    bucket = oss2.Bucket(auth, ENDPOINT, BUCKETNAME, enable_crc=False)

    # Parse a novel's detail page
    def parse_article(self, html, chapternum, callback=None):
        try:
            article_page = etree.HTML(html)
            article = Article()
            article.name = article_page.xpath("//div[@class='rt']/h1")[0].text
            article.author = article_page.xpath("//meta[@property='og:novel:author']")[0].attrib['content']
            article.largesort = article_page.xpath("//meta[@property='og:novel:category']")[0].attrib['content']
            article.keywords = article_page.xpath("//meta[@name='keywords']")[0].attrib['content']
            article.intro = article_page.xpath("//meta[@property='og:description']")[0].attrib['content']
            article.status = article_page.xpath("//meta[@property='og:novel:status']")[0].attrib['content']
            article.image_url = article_page.xpath("//img")[0].attrib['src']
            article.chapters = chapternum
            # Replacement map: strip unwanted strings from the crawled keywords
            remap = {
                'XX读书网': 'YY小说网',
                ',': ','
            }
            for skey in remap:
                article.keywords = article.keywords.replace(skey, remap[skey])
            if callback:
                callback(article)
            with open('record.txt', 'a') as f:
                f.write('##' * 30 + '\n')
                f.write(str(article.articleid) + article.name + ' author: ' + article.author
                        + ' chapters: ' + str(article.chapters) + ' '
                        + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) + '\n')
            print('##' * 30)
            print(article.articleid, article.name, 'author:', article.author, 'chapters:', article.chapters)
            self.article = article
        except Exception:
            self.article = None

    # Parse a chapter page
    def parse_chapters(self, html, chapterorder, article, callback=None):
        try:
            chapter_page = etree.HTML(html)
            chapter = Chapter()
            chapter.chaptername = chapter_page.xpath("//div[@class='novel']/h1")[0].text
            chapter.chapterContent = chapter_page.xpath("//div[@class='yd_text2']//text()")
            chapter.chapterorder = chapterorder
            chapter.articleid = article.articleid
            if callback:
                callback(article, chapter)
            self.chapter = chapter
        except Exception:
            self.chapter = None

    def saveArticle(self, article):
        with self.conn.cursor() as cur:
            sql = "select count(*) as 'num' from article where articlename=%s and author=%s"
            args = (article.name, article.author)
            cur.execute(sql, args)
            count = cur.fetchone()['num']
        if count <= 0:
            # New novel: insert it and store its cover image
            with self.conn.cursor() as cur:
                sql = ("insert into article(articlename,author,intro,keywords,largesort,status,postdate) "
                       "values(%s,%s,%s,%s,%s,%s,%s)")
                args = (article.name, article.author, article.intro, article.keywords,
                        article.largesort, article.status, time.time())
                cur.execute(sql, args)
                self.conn.commit()
                article.articleid = int(cur.lastrowid)
                article.in_database = False
            self.writeArticlePic(article, toOss=TO_OSS)
        else:
            # The novel already exists: check whether new chapters have appeared
            article.in_database = True
            with self.conn.cursor() as cur:
                sql = "select chapters,articleid from article where articlename=%s and author=%s"
                args = (article.name, article.author)
                cur.execute(sql, args)
                data = cur.fetchone()
                oldchapters = int(data['chapters']) if data['chapters'] else 0  # chapter count stored in the database
                oldarticleid = data['articleid']
            article.new_database = oldchapters < article.chapters  # True means the site has newer chapters
            article.articleid = oldarticleid
            article.database_chapters = oldchapters

    def writeArticlePic(self, article, toOss=False):
        # Download the cover image and store it locally or in OSS
        res = requests.get(article.image_url, stream=True, verify=False)
        if not toOss:
            filename = self.baseimagepath + '/' + str(article.articleid) + '.' + article.image_url.split('.')[-1]
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'wb') as f:
                for data in res.iter_content(chunk_size=1024):
                    f.write(data)
        else:
            filename = "images/{article}.{name}".format(article=str(article.articleid),
                                                        name=article.image_url.split('.')[-1])
            self.bucket.put_object(filename, res.content)

    def writechapter(self, article, chapter, toOss=False):
        # Write the chapter text to a local file or to OSS and return its size
        chaptersize = 0
        if not toOss:
            filename = "{basepath}/{large}/{article}/{name}.txt".format(
                basepath=self.basefilepath,
                large=str(int(article.articleid / 1000)),
                article=str(article.articleid),
                name=str(chapter.chapterorder))
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'w', encoding='utf-8') as f:
                for line in chapter.chapterContent:
                    line = line.strip()
                    chaptersize += len(line)
                    f.write(line + '\n')
        else:
            filename = "{large}/{article}/{name}.txt".format(
                large=str(int(article.articleid / 1000)),
                article=str(article.articleid),
                name=str(chapter.chapterorder))
            for i, line in enumerate(chapter.chapterContent):
                chapter.chapterContent[i] = line.strip() + '\r\n'
                chaptersize += len(line)
            self.bucket.put_object(filename, ''.join(chapter.chapterContent))
        return chaptersize

    def writechaptertodatabase(self, article, chapter, chaptersize):
        with self.conn.cursor() as cur:
            sql = ("insert into chapter(chaptername,chapterorder,size,articleid,articlename,volumename) "
                   "values(%s,%s,%s,%s,%s,%s)")
            args = (chapter.chaptername, chapter.chapterorder, chaptersize,
                    article.articleid, article.name, '正文')
            cur.execute(sql, args)
            chapter.chapterid = int(cur.lastrowid)
            self.conn.commit()

    def saveChapters(self, article, chapter):
        chaptersize = self.writechapter(article, chapter, toOss=TO_OSS)
        self.writechaptertodatabase(article, chapter, chaptersize)
        with open('record.txt', 'a') as f:
            line_text = (str(article.articleid) + article.name + '---------------'
                         + str(chapter.chapterorder) + '--of--' + str(article.chapters)
                         + chapter.chaptername + str(chapter.chapterid) + '---'
                         + 'Save Success ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
            f.write(line_text + '\n')

    def updateSizesOfArticle(self, chapter):
        # After saving a chapter, refresh the novel's latest-chapter info, chapter count and total size
        with self.conn.cursor() as cur:
            sql = "update article set lastchapterid=%s,lastchapter=%s where articleid=%s"
            args = (chapter.chapterid, chapter.chaptername, chapter.articleid)
            cur.execute(sql, args)
            self.conn.commit()
        with self.conn.cursor() as cur:
            sql = "select count(*) as 'num',sum(size) as 'chaptersizes' from chapter where articleid=%s"
            cur.execute(sql, (chapter.articleid,))
            data = cur.fetchone()
            chapternums = data['num']
            sizes = data['chaptersizes']
        with self.conn.cursor() as cur:
            sql = "update article set chapters=%s,size=%s where articleid=%s"
            args = (chapternums, sizes, chapter.articleid)
            cur.execute(sql, args)
            self.conn.commit()

    def getPageArticles(self, article_page):
        # Crawl every novel linked from one list page
        if not article_page:
            return
        article_links = self.download_links(article_page, BASE_URL, ARTICLE_LINKS_XPATH)
        if not article_links:
            return
        for link in article_links:
            article_page = self.download_page(link)
            if not article_page:
                continue
            chapter_links = self.download_links(article_page, link, CHAPTER_LINKS_XPATH)
            if not chapter_links:
                continue
            self.parse_article(article_page, len(chapter_links), callback=self.saveArticle)
            if self.article is None:
                continue
            # Skip novels that are already in the database and have no new chapters
            if self.article.in_database and not self.article.new_database:
                continue
            for order, c_link in enumerate(chapter_links):
                # For a known novel, skip the chapters that are already stored
                if self.article.new_database and order + 1 <= self.article.database_chapters:
                    continue
                chapter_page = self.download_page(c_link)
                if chapter_page:
                    self.parse_chapters(chapter_page, order + 1, self.article, self.saveChapters)
                    if self.chapter is not None:
                        self.updateSizesOfArticle(self.chapter)
                    else:
                        break
                else:
                    # Record the failed chapter so it can be retried later
                    with self.conn.cursor() as cur:
                        sql = ("insert into errorlog(articleid,articlename,chapterorder,chapterlink,errortime) "
                               "values(%s,%s,%s,%s,%s)")
                        args = (self.article.articleid, self.article.name, order + 1, c_link,
                                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
                        cur.execute(sql, args)
                        self.conn.commit()
                    break

    def start(self):
        article_page = self.download_page(START_URL)
        while True:
            self.getPageArticles(article_page)
            next_url = self.download_links(article_page, BASE_URL, ARTICLE_NEXTPAGE_XPATH)
            if next_url:
                next_page = self.download_page(next_url[0])
                if next_page:
                    article_page = next_page
                    continue
            print('All articles have been downloaded, exiting.')
            break
        self.close()

    def close(self):
        self.conn.close()

if __name__ == '__main__':
    s = dushuspider()
    s.start()
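Once the crawler has run, you can spot-check the result by listing the stored novels from MySQL and reading one chapter back from OSS. A minimal sketch; the object key and IDs below are examples, so substitute your own:

import pymysql
import oss2
from setting import DB_CONFIG, ACCESSKEYID, ACCESSKEYSECRET, ENDPOINT, BUCKETNAME

# List the novels stored in MySQL
conn = pymysql.connect(**DB_CONFIG)
with conn.cursor() as cur:
    cur.execute("select articleid, articlename, author, chapters, size from article order by articleid")
    for row in cur.fetchall():
        print(row['articleid'], row['articlename'], row['author'], row['chapters'], row['size'])
conn.close()

# Read one chapter back from OSS (key format: {articleid // 1000}/{articleid}/{chapterorder}.txt)
bucket = oss2.Bucket(oss2.Auth(ACCESSKEYID, ACCESSKEYSECRET), ENDPOINT, BUCKETNAME)
print(bucket.get_object('0/1/1.txt').read().decode('utf-8'))  # example key: novel 1, chapter 1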

(Screenshot: chapter text of a crawled novel stored under its OSS "directory")
