# Download the articles listed on a WeChat official-account homepage into one text file.
import json
import urllib.parse

import requests
from bs4 import BeautifulSoup
# Homepage link used when the user submits an empty line at the prompt.
DEFAULT_URL = (
    "http://mp.weixin.qq.com/mp/homepage?__biz=MzIwMTg2NzU4Mw==&hid=12"
    "&sn=1cb2b9a74e73dd01951f7e67853209af&scene=18#wechat_redirect"
)

# Title used when the homepage heading cannot be located.
DEFAULT_TITLE = "斗罗大陆"

# Boilerplate snippets removed from every article body.  The first entry is a
# single space, so all plain spaces are stripped from the text as well.
ADS = (
    " ",
    '点击上方"蓝字"关注唐门大小事儿实时知!',
    "戳“阅读原文”火速进入下一章",
    "目前100000+人已关注加入我们",
    "点下広吿再走丨助小视能量满满~",
)

# Number of homepage categories (cid = 0 .. CATEGORY_COUNT - 1) that are queried.
CATEGORY_COUNT = 6

# Endpoint that returns the article list of one category as JSON.
HOMEPAGE_API = "https://mp.weixin.qq.com/mp/homepage"

# Placeholder written to the output file when an article cannot be fetched.
FETCH_ERROR_TEXT = "本章节抓取错误,无内容"

# Characters that are path separators or reserved in common file systems.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: "_" for ch in '\\/:*?"<>|'})


def strip_ads(text, ads=ADS):
    """Return *text* with every snippet in *ads* removed, applied in order."""
    for ad in ads:
        text = text.replace(ad, "")
    return text


def split_paragraphs(words, soft_limit=200, hard_limit=500):
    """Insert blank-line paragraph breaks into an unbroken run of text.

    A break ("\\n\\n") is added after a full stop ("。") once more than
    *soft_limit* characters have been emitted since the previous break and the
    text is not inside a “…” quotation, or unconditionally once more than
    *hard_limit* characters have been emitted.

    Args:
        words: The article text to split.
        soft_limit: Character count after which a break outside quotes is allowed.
        hard_limit: Character count after which a break is forced even in quotes.

    Returns:
        The text with paragraph breaks inserted.
    """
    pieces = []
    since_break = 0
    in_quote = False
    for ch in words:
        pieces.append(ch)
        if ch == "“":
            in_quote = True
        elif ch == "”":
            in_quote = False
        if ch == "。" and (
            (since_break > soft_limit and not in_quote) or since_break > hard_limit
        ):
            pieces.append("\n\n")
            since_break = 0
            # A forced break may land inside a quotation; start the next
            # paragraph as if outside one.
            in_quote = False
        since_break += 1
    return "".join(pieces)


def safe_filename(name):
    """Return *name* with path separators and reserved characters replaced by "_".

    The name comes from a scraped web page, so it must not be able to point
    outside the working directory or contain characters the file system rejects.
    """
    return name.translate(_UNSAFE_FILENAME_CHARS)


def fetch_title(session, url_home):
    """Return the heading text of the homepage at *url_home*.

    Falls back to DEFAULT_TITLE when the heading element is missing or empty.
    """
    page = session.get(url_home)
    headings = BeautifulSoup(page.text, features="lxml").select(
        'body > div > div > div > h2')
    if not headings:
        return DEFAULT_TITLE
    return headings[0].get_text().strip().replace("\n", "") or DEFAULT_TITLE


def fetch_chapter_text(session, link):
    """Return the stripped text of the "#js_content" element of the page at *link*.

    Raises:
        IndexError: If the page has no "#js_content" element.
        IOError: If the HTTP request fails (requests' exceptions derive from it).
    """
    page = session.get(link)
    nodes = BeautifulSoup(page.text, features="lxml").select("#js_content")
    return nodes[0].get_text().strip()


def main():
    """Prompt for a homepage link and save every listed article to "<title>.txt"."""
    session = requests.Session()

    url_home = input("输入微信分享链接:\n").strip()
    if not url_home:
        url_home = DEFAULT_URL
        print("使用默认链接:" + url_home + "\n\n")

    title = fetch_title(session, url_home)
    print(title + "\n\n")

    # The list endpoint is called with the same identifiers the link carries.
    query = dict(urllib.parse.parse_qsl(urllib.parse.urlsplit(url_home).query))
    base_params = {
        "__biz": query["__biz"],
        "hid": query["hid"],
        "sn": query["sn"],
        "scene": query["scene"],
    }

    # Explicit UTF-8 so the Chinese text is written correctly regardless of
    # the platform's default encoding.
    with open(safe_filename(title) + ".txt", "w", encoding="utf-8") as file:
        for cid in range(CATEGORY_COUNT):
            data = {
                **base_params,
                "cid": cid,
                # Only the first 20 entries of each category are requested.
                "begin": 0,
                "count": 20,
                "action": "appmsg_list",
                "f": "json",
                # NOTE(review): fixed value, presumably a cache-busting token — confirm.
                "r": 0.27656332194465394,
            }
            res = session.post(HOMEPAGE_API, data=data)
            listing = json.loads(res.content)

            for zhangjie in listing["appmsg_list"]:
                chapter = zhangjie["title"]
                try:
                    words = strip_ads(fetch_chapter_text(session, zhangjie["link"]))
                except (IOError, IndexError):
                    # Network failure or a page without a content node: record
                    # a placeholder and keep going with the next chapter.
                    content = FETCH_ERROR_TEXT
                    print(chapter + ":\n " + content)
                else:
                    content = split_paragraphs(words)
                    preview = content.replace("\n", "").strip()[0:50]
                    print(chapter + ":\n " + preview + "……\n")

                file.write(chapter + "\n\n" + content + "\n\n")


if __name__ == "__main__":
    main()