月色真美

A Struggling Student's Python Learning Diary: Scraping Novels

2020-12-01


Part 1 of this struggling student's Python journey: I put together a simple novel parser and downloader, a bit over two hundred lines of code, that can download any novel hosted on this site. I can't help marveling at how powerful Python's packages are; they seem tailor-made for scraping websites. The environment is VS Code + Anaconda with Python 3.6. Enough talk, here's the code!

import requests
import os
import json
import re
from bs4 import BeautifulSoup
from urllib.parse import urlparse
txtPath = "txt"
getHeaders = {
    "Referer": "http://www.47uc.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
postHeaders = {
    "Host": "www.47uc.com",
    "Content-Type": "application/x-www-form-urlencoded",
    "Referer": "http://www.47uc.com/",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}
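# Fetch a page and decode it as utf-8; returns "" and logs the error on any failure.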
def findHtml(url):
    try:
        # headers must be passed by keyword; the second positional argument of requests.get is params
        return requests.get(url, headers=getHeaders, timeout=10).content.decode('utf-8', 'ignore')
    except Exception as e:
        log("Exception while fetching page " + url + ": " + str(e))
        return ""
def findApi(api, postData, postHeaders):
    try:
        # headers must also be passed by keyword; the third positional argument of requests.post is json
        return requests.post(api, data=postData, headers=postHeaders, timeout=10).text
    except Exception as e:
        log("Exception while calling API " + api + ": " + str(e) + " ; postData: " + str(postData))
        return ""
def downloadChapter(url, retry=0):
    html = findHtml(url)
    if html == "":
        return ""
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find("div", id="content")
    if content is None:
        return ""
    # The source pages use "o" in place of "。"; convert it back
    content_text = content.text.replace("  ", "\n  ").replace("o", "。")
    # Ad removal is not implemented yet
    if "加载" in content_text and retry < 4:
        # Page is not statically cached yet: pull the hash from the page source,
        # and the novel id and chapter id from the URL
        h = re.findall(r"var hash = \"([0-9a-z]+)\";", html)
        r = re.findall(r"http://www\.47uc\.com/[0-9]+_([0-9]+)/([0-9]+)", url)
        if not h or not r:
            return ""
        # Ask the ajax endpoint for the chapter body; re.findall returns a list, so pass the first match
        apiResponse = findApi(
            "http://www.47uc.com/home/index/ajaxchapter",
            {"id": r[0][0], "eKey": h[0], "cid": r[0][1], "basecid": 1}, postHeaders)
        if apiResponse == "":
            return ""
        try:
            j = json.loads(apiResponse)
            if "info" in j and "content" in j["info"]:
                return "\n\n" + j["info"]["content"].replace("<br>", "\n").replace("<br/>", "\n").replace("o", "。").replace("\xa0", " ")
            else:
                return downloadChapter(url, retry + 1)
        except Exception as e:
            log("Exception while decoding JSON: " + str(e) + " ; JSON data: " + str(apiResponse))
            return downloadChapter(url, retry + 1)
    return content_text
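# File helpers: chapters are appended to <txtPath>/<title>.txt, errors go to <txtPath>/run.log.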
def saveChapter(title, content):
    # Create the output folder if it does not exist yet
    if not os.path.exists(txtPath):
        os.makedirs(txtPath)
    # Save the chapter
    append(title, content)
def append(title, content):
    with open(os.path.join(txtPath, title + ".txt"), "ab+") as file:
        file.write(content.encode("utf-8"))
def log(content):
    # The log may be written before any chapter has been saved, so create the folder here too
    if not os.path.exists(txtPath):
        os.makedirs(txtPath)
    with open(os.path.join(txtPath, "run.log"), "ab+") as file:
        file.write((content + "\r\n").encode("utf-8"))
# Chinese digit characters
number_map = {
    "零": 0,
    "一": 1,
    "二": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
}
# Chinese unit characters
unit_map = {
    "十": 10,
    "百": 100,
    "千": 1000,
    "万": 10000,
    "亿": 100000000
}
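# Convert a Chinese numeral such as "一百二十三" (123) or "十五" (15) to an int;
# returns -1 if the string contains an unrecognized character.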
def convertToNumber(inputs):
    output = 0
    unit = 1
    num = 0
    for index, cn_num in enumerate(inputs):
        if cn_num in number_map:
            # A digit character
            num = number_map[cn_num]
            # A trailing digit with no unit after it is the ones place
            if index == len(inputs) - 1:
                output = output + num
        elif cn_num in unit_map:
            # A unit character
            unit = unit_map[cn_num]
            # A leading "十" implies one, e.g. "十五" is 15
            if unit == 10 and num == 0:
                num = 1
            # Accumulate digit * unit
            output = output + num * unit
            num = 0
        else:
            # Unrecognized character
            output = -1
            break
    return output
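# Extract the chapter number from a heading like "第十章 生意人" or "第10章";
# prologues ("序章" / "楔子") map to 0, unparseable headings to -1.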
def findChapterNumber(title):
    # Chinese-numeral headings, e.g. "第十章"
    result = re.findall(r"第([零一二三四五六七八九十百千万亿]+)章", title)
    if len(result) > 0:
        return convertToNumber(result[0])
    # Arabic-numeral headings, e.g. "第10章"
    result = re.findall(r"第([0-9]+)章", title)
    if len(result) > 0:
        return int(result[0])
    # Prologues count as chapter 0
    if "序章" in title or "楔子" in title:
        return 0
    return -1
def sortByChapterNumber(item):
    # Sort key: the chapter number is the first element of each tuple
    return item[0]
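# Download every chapter in chapterUrlList in order and append them all to one txt file.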
def downloadTxt(url, name, chapterUrlList, missingChapterList, startNum=0, endNum=0):
    if startNum < 0:
        print("❌ Invalid start chapter, stopping")
        return
    print("💯 Fetched the table of contents, analyzing....")
    if len(chapterUrlList) < 1:
        print("❌ This site has no table of contents for this novel")
        return
    print("⚡ Parsed " + str(len(chapterUrlList)) + " chapters in total, " +
          str(len(missingChapterList)) + " failed to parse; starting download...")
    # Re-sort the chapters by chapter number
    chapterUrlList.sort(key=sortByChapterNumber)
    # Download and save each chapter
    count = len(chapterUrlList)
    index = 0
    for _chapter in chapterUrlList:
        index += 1
        # Strip the heading from the body text, normalizing a few title variants first
        content = downloadChapter(_chapter[1]).replace(
            _chapter[2].replace(" 上", "(上)").replace(" 下", "(下)").replace("章 ", "章"), "")
        if content == "":
            missingChapterList.append(_chapter[2] + " - download failed")
            print("{:.2%} ".format(index / count) +
                  "🚀 Fetching [" + str(_chapter[0]) + " - " + _chapter[2] + " - " + _chapter[1] + "] ❌ no response from server")
            continue
        saveChapter(name, "\r\n\r\n" + _chapter[2] + content)
        print("{:.2%} ".format(index / count) +
              "🚀 Fetching [" + str(_chapter[0]) + " - " + _chapter[2] + " - " + _chapter[1] + "] ✔")
    # Report any failed chapters
    if len(missingChapterList) > 0:
        print("Some chapters failed, listed below\r\n" + "\r\n".join(missingChapterList))
    if startNum == 0 and endNum == 0:
        print("✔ Finished downloading chapters 0-" + str(count))
    else:
        print("✔ Finished downloading chapters " + str(startNum) + "-" + str(endNum))
def findChapter(url):
    _u = urlparse(url)
    domain = _u.scheme + "://" + _u.netloc
    html = findHtml(url)
    # Parse the table-of-contents page
    soup = BeautifulSoup(html, 'html.parser')
    chapterUrlList = []
    missingChapterList = []
    # All chapter links live under the element with id "list"
    for _a in soup.select("div#list dd > a"):
        _chapterNumber = findChapterNumber(_a.text)
        if _chapterNumber == -1:
            missingChapterList.append(_a.text + " - failed to parse")
            continue
        _chapter = (_chapterNumber, domain + _a.attrs["href"], _a.text)
        if _chapter not in chapterUrlList:
            chapterUrlList.append(_chapter)
    return (chapterUrlList, missingChapterList)
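# Interactive entry point: search the site, let the user pick a result, then download it.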
def searchTxt():
    while True:
        title = input("🔸 Enter the name of the novel to download: ")
        searchResponse = findApi(
            "http://www.47uc.com/home/search", {"action": "search", "q": title}, postHeaders)
        if searchResponse == "":
            # Read the user's reply from stdin
            _input = input("❌ No results; try another keyword, or enter e to quit => ")
            if _input == "e":
                break
            continue
        try:
            soup = BeautifulSoup(searchResponse, 'html.parser')
            index = 0
            searchList = []
            for _item in soup.select("#hotcontent > div > ul > li > a"):
                index += 1
                _url = "http://www.47uc.com" + _item.attrs["href"]
                _c = findChapter(_url)
                searchList.append((_url, _item.attrs["title"], _c))
                print("🍀 " + str(index) + "." + _item.attrs["title"] + " " +
                      str(len(_c[0])) + " chapters / " + str(len(_c[1])) + " broken")
            if len(searchList) < 1:
                _input = input("❌ No results; try another keyword, or enter e to quit => ")
                if _input == "e":
                    break
                continue
            x = input("🔸 Enter the number of the novel to download => ")
            # Validate without eval(), and keep the number within range
            while not x.isdigit() or int(x) < 1 or int(x) > len(searchList):
                x = input("❌ Please enter a valid number => ")
            _x = int(x)
            downloadTxt(searchList[_x - 1][0], "《" + searchList[_x - 1][1] + "》",
                        searchList[_x - 1][2][0], searchList[_x - 1][2][1])
            break
        except Exception as e:
            log("Exception while searching: " + str(e) + " ; response: " + str(searchResponse))
            _input = input("❌ Search failed; try another keyword, or enter e to quit => ")
            if _input == "e":
                break
searchTxt()

This novel site is a bit unusual: the pages are not purely static. The first time anyone opens a chapter, the page calls a chapter API to fetch that chapter's content dynamically, and the server then renders a static copy of the chapter page in the background. Evidently this novel site scrapes its content from other sites as well. The attachment is a sample of a downloaded novel.
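To make that mechanism concrete, here is a minimal, self-contained sketch of just the dynamic-chapter call, pulled out of the full script. The endpoint and the parameter names (id, eKey, cid, basecid) are the ones used above; whether they still match the live site is an assumption, and fetch_chapter_via_api is only an illustrative name.

import re
import requests

def fetch_chapter_via_api(page_html, page_url):
    # Sketch only: the endpoint and parameters mirror the script above and may
    # have changed on the live site
    hashes = re.findall(r'var hash = "([0-9a-z]+)";', page_html)
    ids = re.findall(r"http://www\.47uc\.com/[0-9]+_([0-9]+)/([0-9]+)", page_url)
    if not hashes or not ids:
        return None
    book_id, chapter_id = ids[0]
    resp = requests.post(
        "http://www.47uc.com/home/index/ajaxchapter",
        data={"id": book_id, "eKey": hashes[0], "cid": chapter_id, "basecid": 1},
        headers={"Referer": "http://www.47uc.com/", "User-Agent": "Mozilla/5.0"},
        timeout=10,
    )
    info = resp.json().get("info", {})
    return info.get("content")

After the first such request the server caches a static copy, so a later plain GET of page_url returns the full chapter without this call.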

Attachment: a sample of a downloaded novel