I love reading novels, but reading them online means wading through ads, which really spoils the mood, so I decided to write a crawler to fetch them myself. Since I may build my own app on top of this later, the data is best kept online: the novel text goes into Alibaba Cloud OSS, while the novel and chapter metadata go into a MySQL database, which can be either a cloud database or a local one.
The OSS "directories" created for the crawled novels
1. Create the configuration file setting.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pymysql

HEADERS = {'Content-type': 'text/html',
           'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
# Base address of the site
BASE_URL = 'https://www.88dushu.com'
# Page where crawling starts
START_URL = 'https://www.88dushu.com/top/fullflag/'
# XPath for the novel links on the list page
ARTICLE_LINKS_XPATH = "//div[@class='booklist']//span[@class='sm']//a"
# XPath for the "next page" link of the novel list
ARTICLE_NEXTPAGE_XPATH = "//a[@class='next']"
# XPath for the chapter links
CHAPTER_LINKS_XPATH = "//div[@class='mulu']//a"
# Whether to store chapter text in OSS; False stores it locally
TO_OSS = True
# OSS connection settings; see your Alibaba Cloud OSS console for details
ACCESSKEYID = 'your accesskeyid'
ACCESSKEYSECRET = 'your access key secret'
ENDPOINT = 'oss-cn-hongkong-internal.aliyuncs.com'
BUCKETNAME = 'your bucket'

DB_CONFIG = {
    'host': '127.0.0.1',
    'port': 3306,
    'user': 'root_user',
    'password': 'rootpassword',
    'db': 'youdb',
    'charset': 'utf8mb4',
    'cursorclass': pymysql.cursors.DictCursor
}
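The SQL used later assumes that the tables article, chapter and errorlog already exist in this database. The original post does not show the schema, so the following is only a minimal, hypothetical sketch inferred from the INSERT/UPDATE statements in the spider; the column types and sizes are guesses and can be adjusted:

# Hypothetical schema sketch, inferred from the spider's SQL; not the original author's DDL.
import pymysql
from setting import DB_CONFIG

DDL = [
    """CREATE TABLE IF NOT EXISTS article (
        articleid INT AUTO_INCREMENT PRIMARY KEY,
        articlename VARCHAR(255), author VARCHAR(100), intro TEXT,
        keywords VARCHAR(255), largesort VARCHAR(50), status VARCHAR(20),
        lastchapterid INT, lastchapter VARCHAR(255),
        chapters INT, size INT, postdate DOUBLE
    ) DEFAULT CHARSET=utf8mb4""",
    """CREATE TABLE IF NOT EXISTS chapter (
        chapterid INT AUTO_INCREMENT PRIMARY KEY,
        chaptername VARCHAR(255), chapterorder INT, size INT,
        articleid INT, articlename VARCHAR(255), volumename VARCHAR(50)
    ) DEFAULT CHARSET=utf8mb4""",
    """CREATE TABLE IF NOT EXISTS errorlog (
        id INT AUTO_INCREMENT PRIMARY KEY,
        articleid INT, articlename VARCHAR(255), chapterorder INT,
        chapterlink VARCHAR(255), errortime VARCHAR(20)
    ) DEFAULT CHARSET=utf8mb4""",
]

conn = pymysql.connect(**DB_CONFIG)
with conn.cursor() as cur:
    for stmt in DDL:   # create the three tables the spider writes to
        cur.execute(stmt)
conn.commit()
conn.close()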
2. Define the fields to be crawled in items.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-


class Article():
    # Fields describing one novel
    articleid = str()
    name = str()
    keywords = str()
    author = str()
    largesort = str()
    status = str()
    lastchapter = str()
    lastchapterid = str()
    chapters = str()           # current number of chapters on the site (live value)
    database_chapters = str()  # temporarily holds the chapter count already in the database
    intro = str()
    postdate = str()
    lastupdate = str()
    size = str()
    image_url = str()
    in_database = False        # whether this novel already exists in the database
    new_database = False       # whether new chapters have appeared


class Chapter():
    # Fields describing one chapter
    articleid = str()
    article_in_database = str()
    articlename = str()
    chapterid = str()
    chaptername = str()
    chapterorder = str()
    size = str()
    chapterContent = str()
    lastorder = str()
    volumename = str()
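These are plain record-style classes: the class-level str() attributes are only defaults, and the spider overwrites them per instance. A tiny illustrative sketch (the values here are made up):

# Illustrative only: instance assignments shadow the class-level defaults.
from items import Article

a = Article()
a.name = 'Some Novel'
a.author = 'Some Author'
a.chapters = 120
print(a.name, a.author, a.chapters)  # values set on this instance
print(repr(Article.chapters))        # the class default is still the empty string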
3. Create the base class Spider, which implements page downloading and link extraction
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from setting import *
import requests
from urllib.parse import urljoin
from lxml import etree
from time import sleep


class Spider():
    def download_page(self, url, headers=None, retries=3):
        '''Download a page, retrying a few times on failure.'''
        try:
            response = requests.get(url, headers=headers, verify=False)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text
        except requests.RequestException:
            sleep(10)
            if retries > 0:
                return self.download_page(url, headers, retries - 1)
            return None

    def download_links(self, html, base_url, links_xpath):
        '''Extract absolute links from html using the given XPath.'''
        page = etree.HTML(html)
        elements = page.xpath(links_xpath)
        links = [urljoin(base_url, element.attrib['href']) for element in elements]
        return None if len(links) == 0 else links
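Before wiring up the full spider, the base class can be smoke-tested on its own. A small sketch, assuming the class above is saved as spider.py (as the imports in step 4 suggest):

# Quick smoke test for Spider.download_page / download_links.
from spider import Spider
from setting import START_URL, BASE_URL, ARTICLE_LINKS_XPATH

s = Spider()
html = s.download_page(START_URL)
if html:
    links = s.download_links(html, BASE_URL, ARTICLE_LINKS_XPATH)
    print(len(links or []), 'novel links found')
    for link in (links or [])[:3]:
        print(link)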
4. Implement the novel spider:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from spider import Spider
from setting import *
from lxml import etree
from items import Article, Chapter
import pymysql
import time
import os
import requests
import oss2
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the warnings caused by verify=False
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)


class dushuspider(Spider):
    conn = pymysql.connect(**DB_CONFIG)
    basefilepath = os.getcwd() + '/files'
    baseimagepath = os.getcwd() + '/images'
    chapter = None
    article = None
    auth = oss2.Auth(ACCESSKEYID, ACCESSKEYSECRET)
    bucket = oss2.Bucket(auth, ENDPOINT, BUCKETNAME, enable_crc=False)

    # Parse a novel's detail page
    def parse_article(self, html, chapternum, callback=None):
        try:
            article_page = etree.HTML(html)
            article = Article()
            article.name = article_page.xpath("//div[@class='rt']/h1")[0].text
            article.author = article_page.xpath("//meta[@property='og:novel:author']")[0].attrib['content']
            article.largesort = article_page.xpath("//meta[@property='og:novel:category']")[0].attrib['content']
            article.keywords = article_page.xpath("//meta[@name='keywords']")[0].attrib['content']
            article.intro = article_page.xpath("//meta[@property='og:description']")[0].attrib['content']
            article.status = article_page.xpath("//meta[@property='og:novel:status']")[0].attrib['content']
            article.image_url = article_page.xpath("//img")[0].attrib['src']
            article.chapters = chapternum
            # Replace unwanted strings found in the crawled keywords
            remap = {
                'XX读书网': 'YY小说网',
                ',': ','
            }
            for skey in remap:
                article.keywords = article.keywords.replace(skey, remap[skey])
            if callback:
                callback(article)
            with open('record.txt', 'a') as f:
                f.write('##' * 30 + '\n')
                f.write(str(article.articleid) + article.name + ' author: ' + article.author
                        + ' chapters: ' + str(article.chapters)
                        + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())) + '\n')
            print('##' * 30)
            print(article.articleid, article.name, 'author:', article.author, 'chapters:', article.chapters)
            self.article = article
        except Exception:
            self.article = None

    # Parse a chapter page
    def parse_chapters(self, html, chapterorder, article, callback=None):
        try:
            chapter_page = etree.HTML(html)
            chapter = Chapter()
            chapter.chaptername = chapter_page.xpath("//div[@class='novel']/h1")[0].text
            chapter.chapterContent = chapter_page.xpath("//div[@class='yd_text2']//text()")
            chapter.chapterorder = chapterorder
            chapter.articleid = article.articleid
            if callback:
                callback(article, chapter)
            self.chapter = chapter
        except Exception:
            self.chapter = None

    def saveArticle(self, article):
        with self.conn.cursor() as cur:
            sql = "select count(*) as 'num' from article where articlename=%s and author=%s"
            args = (article.name, article.author)
            cur.execute(sql, args)
            count = cur.fetchone()['num']
        if count <= 0:  # a novel we have not stored yet
            with self.conn.cursor() as cur:
                sql = ("insert into article(articlename,author,intro,keywords,largesort,status,postdate) "
                       "values(%s,%s,%s,%s,%s,%s,%s)")
                args = (article.name, article.author, article.intro, article.keywords,
                        article.largesort, article.status, time.time())
                cur.execute(sql, args)
                self.conn.commit()
                article.articleid = int(cur.lastrowid)
                article.in_database = False
            self.writeArticlePic(article, toOss=TO_OSS)
        else:
            article.in_database = True
            with self.conn.cursor() as cur:
                sql = "select chapters,articleid from article where articlename=%s and author=%s"
                args = (article.name, article.author)
                cur.execute(sql, args)
                data = cur.fetchone()
            oldchapters = int(data['chapters']) if data['chapters'] else 0  # chapter count already in the database
            oldarticleid = data['articleid']
            article.new_database = oldchapters < article.chapters  # True means new chapters have appeared
            article.articleid = oldarticleid
            article.database_chapters = oldchapters

    def writeArticlePic(self, article, toOss=False):
        # Download the cover image and store it locally or in OSS
        res = requests.get(article.image_url, stream=True, verify=False)
        if not toOss:
            filename = self.baseimagepath + '/' + str(article.articleid) + '.' + article.image_url.split('.')[-1]
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'wb') as f:
                for data in res.iter_content(chunk_size=1024):
                    f.write(data)
                    f.flush()
        else:
            filename = "images/{article}.{name}".format(article=str(article.articleid),
                                                        name=article.image_url.split('.')[-1])
            self.bucket.put_object(filename, res)

    def writechapter(self, article, chapter, toOss=False):
        # Write the chapter text to a local file or to OSS and return its character count
        chaptersize = 0
        if not toOss:
            filename = "{basepath}/{large}/{article}/{name}.txt".format(basepath=self.basefilepath,
                                                                        large=str(int(article.articleid / 1000)),
                                                                        article=str(article.articleid),
                                                                        name=str(chapter.chapterorder))
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            with open(filename, 'w', encoding='utf-8') as f:
                for line in chapter.chapterContent:
                    line = line.strip()
                    chaptersize += len(line)
                    f.write(line + '\n')
        else:
            filename = "{large}/{article}/{name}.txt".format(large=str(int(article.articleid / 1000)),
                                                             article=str(article.articleid),
                                                             name=str(chapter.chapterorder))
            for i, line in enumerate(chapter.chapterContent):
                line = line.strip()
                chaptersize += len(line)
                chapter.chapterContent[i] = line + '\r\n'
            self.bucket.put_object(filename, ''.join(chapter.chapterContent))
        return chaptersize

    def writechaptertodatabase(self, article, chapter, chaptersize):
        with self.conn.cursor() as cur:
            sql = ("insert into chapter(chaptername,chapterorder,size,articleid,articlename,volumename) "
                   "values(%s,%s,%s,%s,%s,%s)")
            # '正文' is the default volume name (the main text)
            args = (chapter.chaptername, chapter.chapterorder, chaptersize, article.articleid, article.name, '正文')
            cur.execute(sql, args)
            chapter.chapterid = int(cur.lastrowid)
            self.conn.commit()

    def saveChapters(self, article, chapter):
        chaptersize = self.writechapter(article, chapter, toOss=TO_OSS)
        self.writechaptertodatabase(article, chapter, chaptersize)
        with open('record.txt', 'a') as f:
            line_text = (str(article.articleid) + article.name + '-----' + str(chapter.chapterorder)
                         + '--of--' + str(article.chapters) + chapter.chaptername + str(chapter.chapterid)
                         + '---' + 'Save Success'
                         + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
            f.write(line_text + '\n')

    def updateSizesOfArticle(self, chapter):
        # Refresh the article row: latest chapter plus recalculated chapter count and total size
        with self.conn.cursor() as cur:
            sql = "update article set lastchapterid=%s,lastchapter=%s where articleid=%s"
            args = (chapter.chapterid, chapter.chaptername, chapter.articleid)
            cur.execute(sql, args)
            self.conn.commit()
        with self.conn.cursor() as cur:
            sql = "select count(*) as 'num',sum(size) as 'chaptersizes' from chapter where articleid=%s"
            cur.execute(sql, (chapter.articleid,))
            data = cur.fetchone()
            chapternums = data['num']
            sizes = data['chaptersizes']
        with self.conn.cursor() as cur:
            sql = "update article set chapters=%s,size=%s where articleid=%s"
            args = (chapternums, sizes, chapter.articleid)
            cur.execute(sql, args)
            self.conn.commit()

    def getPageArticles(self, article_page):
        # Crawl every novel linked from one list page
        if article_page:
            article_links = self.download_links(article_page, BASE_URL, ARTICLE_LINKS_XPATH)
            if article_links:
                for link in article_links:
                    article_page = self.download_page(link)
                    if article_page:
                        chapter_links = self.download_links(article_page, link, CHAPTER_LINKS_XPATH)
                        self.parse_article(article_page, len(chapter_links), callback=self.saveArticle)
                        if self.article is None:
                            continue
                        if self.article.in_database and not self.article.new_database:
                            continue  # already complete in the database, skip this novel
                        for order, c_link in enumerate(chapter_links):
                            if self.article.new_database and order + 1 <= self.article.database_chapters:
                                continue  # chapter already stored, skip it
                            chapter_page = self.download_page(c_link)
                            if chapter_page:
                                self.parse_chapters(chapter_page, order + 1, self.article, self.saveChapters)
                                if self.chapter is not None:
                                    self.updateSizesOfArticle(self.chapter)
                                else:
                                    break
                            else:
                                # Log the failed chapter so it can be retried later
                                with self.conn.cursor() as cur:
                                    sql = ("insert into errorlog(articleid,articlename,chapterorder,chapterlink,errortime) "
                                           "values(%s,%s,%s,%s,%s)")
                                    args = (self.article.articleid, self.article.name, order + 1, c_link,
                                            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
                                    cur.execute(sql, args)
                                    self.conn.commit()
                                break

    def start(self):
        article_page = self.download_page(START_URL)
        while True:
            self.getPageArticles(article_page)
            next_url = self.download_links(article_page, BASE_URL, ARTICLE_NEXTPAGE_XPATH)
            if next_url:
                next_page = self.download_page(next_url[0])
                if next_page:
                    article_page = next_page
                else:
                    break  # the next list page could not be downloaded, stop here
            else:
                print('All articles have been downloaded, exiting.')
                break
        self.close()

    def close(self):
        self.conn.close()


if __name__ == '__main__':
    s = dushuspider()
    s.start()
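Once chapters are in the bucket, they can be read back with oss2, for example by the app planned for later. A minimal sketch that mirrors the key layout used in writechapter(); the articleid and chapterorder values here are just examples:

# Read a stored chapter back from OSS; the key layout mirrors writechapter().
import oss2
from setting import ACCESSKEYID, ACCESSKEYSECRET, ENDPOINT, BUCKETNAME

auth = oss2.Auth(ACCESSKEYID, ACCESSKEYSECRET)
bucket = oss2.Bucket(auth, ENDPOINT, BUCKETNAME)

articleid, chapterorder = 1, 1  # example ids
key = "{large}/{article}/{order}.txt".format(large=str(int(articleid / 1000)),
                                             article=str(articleid),
                                             order=str(chapterorder))
text = bucket.get_object(key).read().decode('utf-8')
print(text[:200])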
Crawled novel content under one of the OSS "directories"