工作原理

先获取某个板块的Html页面，用正则表达式获取其中的帖子列表，然后随机抽一个。

获取帖子的标题，去生成比较靠谱的回复内容。

然后伪造回帖请求，完成回复。

主要技术

urllib.request

python中HTTP请求的发送可以通过urllib库实现。

构造请求头：

headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Host': 'www.miui.com',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip,deflate',
            'Cookie': cookie,
            'Referer': refer,
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cache-Control':'no-cache'
    }

构造附带的post数据：

post={'message': messge, 'posttime': int(ans_time), 'formhash': hash, 'usesig': '1','subject': ''}
//转成url中的字符串对象
data = urllib.parse.urlencode(post)

发送请求：

//data要先encode成byte格式才能发送
req = urllib2.Request(url, data.encode("utf-8"),headers)

正则表达式

使用re.complie来实现。

import re
//re.complie(正则表达式)
pattern = re.compile(r'<a href="(.*?.html)" onclick=.*?>')   # 查找数字
//patten.findall(要进行匹配的字符串)，返回所有匹配结果的list
threadlist = pattern.findall(res)

定时器

20s触发一次回帖

def mainfunc:
    //函数内部的timer循环调用
    global timer
    timer = threading.Timer(20, mainfunc)
    timer.start()

全部代码

import urllib.request as urllib2
import threading
import random
import urllib.parse
import json
import time
import datetime

//设置自己的cookie
cookie=''

//分带post内容与不带参数版本

def fetch(url,refer):
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Host': 'www.miui.com',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip,deflate',
            'Cookie': cookie,
            'Referer': refer,
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cache-Control':'no-cache'
    }
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    page_source = response.read()
    return page_source
def fetch2(url,refer,post):
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Host': 'www.miui.com',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip,deflate',
            'Cookie': cookie,
            'Referer': refer,
            'Pragma': 'no-cache',
            'Upgrade-Insecure-Requests': '1',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Cache-Control':'no-cache'
    }
    data = urllib.parse.urlencode(post)
    req = urllib2.Request(url, data.encode("utf-8"),headers)
    response = urllib2.urlopen(req)
    page_source = response.read()
    return page_source

def mainfunc():
    //在外部引用re会报错，函数内部无法使用re.complie 不知道为啥
    import re
    //在gid=14的板块下随机获取一个页面
    url='http://www.miui.com/forum.php?gid=14&page='+str(random.randint(1,24))
    html = fetch(url,'http://www.miui.com/index.html')
    res=html.decode('utf-8')
    //获取其中所有的帖子标题
    pattern = re.compile(r'<a href="(.*?.html)" onclick=.*?>') 
    threadlist = pattern.findall(res)
    //筛选，把不是帖子的和不是第一页的去掉，其实没啥用
    for i in threadlist:
        if i.find('thread-')=='-1':
            threadlist.remove(i)
        if i.find('-1-1.html')=='-1':
            threadlist.remove(i)
    //list去重
    threadlist = list(set(threadlist))
    //从list中随机取一个进行恢复
    refer= 'http://www.miui.com/'+threadlist[random.randint(0,len(threadlist)-1)]
    //获取帖子内容
    html = fetch(refer,'http://www.miui.com/index.html')
    res=html.decode('utf-8')
    //找到帖子的回复网址
    pattern = re.compile(r'action=\"(.*?)\"')
    threadpage = pattern.findall(res)
    reply = threadpage[1];
    //reply中把所有的&转义成了&amp;,手动拼回来
    replylist=reply.split("&amp;")
    reply=""
    for i in replylist:
        reply=reply+i+"&"
    //构造请求网址
    url='http://www.miui.com/'+reply+'inajax=1'
    //获取用户的表单hash验证值
    pattern = re.compile(r'<input type="hidden" name="formhash" value="(.*?)" \/>')
    temp = pattern.findall(res)
    hash=temp[0]
    hash=hash+":"+hash[::-1]
    //获取帖子主题
    pattern = re.compile(r'<meta name="description" content="(.*?)"\/>')
    temp = pattern.findall(res)
    content=temp[0]
    content=urllib.parse.quote(content)
    //调用图灵机器人APi获取回复内容
    tulingurl='http://www.tuling123.com/openapi/api?key=0344bd2d86f33bda5654b941f757137a&info='+content
    message=fetch(tulingurl,'')
    message=message.decode('UTF-8','strict')
    message=json.loads(message)
    messge=message["text"]
    //生成UNIX时间戳
    dtime = datetime.datetime.now()
    ans_time = time.mktime(dtime.timetuple())
    //构造回帖附带的参数
    post={'message': messge, 'posttime': int(ans_time), 'formhash': hash, 'usesig': '1','subject': ''}
    //发送回帖请求
    re=fetch2(url,refer,post);
    print(re.decode('UTF-8','strict'))
    //设置定时器，35秒执行一次,小米论坛每小时最多回复100贴，建议间隔设置不要少于35秒
    global timer
    timer = threading.Timer(35, mainfunc)
    timer.start()

//调用主函数
mainfunc()

小米论坛自动回帖脚本实现-Python版

工作原理

主要技术

urllib.request

正则表达式

定时器

全部代码

文章评论