Fetching the reply counts, thread ids and titles from the first ten pages of a Baidu Tieba forum in Python
#!/usr/bin/python
# -*- coding:utf-8 -*-
import urllib
import urllib2
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# Helper class for cleaning up HTML tags
class Tool:
    # strip <img> tags and runs of seven spaces
    removeImg = re.compile('<img.*?>| {7}')
    # strip hyperlink tags
    removeAddr = re.compile('<a.*?>|</a>')
    # turn line-breaking tags into \n
    replaceLine = re.compile('<tr>|<div>|</div>|</p>')
    # turn table cells <td> into \t
    replaceTD = re.compile('<td>')
    # turn paragraph openings into \n plus two spaces
    replacePara = re.compile('<p.*?>')
    # turn <br><br> or <br> into \n
    replaceBR = re.compile('<br><br>|<br>')
    # drop any remaining tags
    removeExtraTag = re.compile('<.*?>')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replacePara, "\n  ", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        # strip() removes leading/trailing whitespace
        return x.strip()
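# Illustrative example of the cleaner above (made-up input, not taken from the original post):
#   Tool().replace('<a href="/p/123">Hello</a><br>world')  ->  'Hello\nworld'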
# Baidu Tieba crawler
class BDTB:
    def __init__(self, baseUrl):
        self.baseURL = baseUrl
        self.tool = Tool()
        self.floor = 1
        self.file = None

    # Fetch one page of the forum's thread list and return its HTML
    def getPage(self, pageNum):
        try:
            url = self.baseURL + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            return response.read().decode('utf-8')
        except urllib2.URLError, e:
            if hasattr(e, "reason"):
                print u"Failed to connect to Baidu Tieba, reason:", e.reason
            return None
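    # For example (illustrative), with a base URL of
    # 'http://tieba.baidu.com/f?kw=python&fr=ala0&tpl=5', getPage(50) requests
    # 'http://tieba.baidu.com/f?kw=python&fr=ala0&tpl=5&pn=50'; the pn parameter
    # advances in steps of 50, one step per list page.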
    # Extract thread titles from the thread-list HTML
    def getTitle(self, page):
        pattern = re.compile(r'<a href="/p/.*? title="(.*?)" .*? class="j_th_tit .*?>(.*?)</a>', re.S)
        result = re.findall(pattern, page)
        gettitles = []
        for item in result:
            gettitle = "\n" + "Title: " + self.tool.replace(item[1]) + "\n"
            gettitles.append(gettitle.encode('utf-8'))
        return gettitles
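    # The pattern above expects thread-list anchors shaped roughly like
    # <a href="/p/123456" title="Some title" class="j_th_tit ...">Some title</a>
    # (illustrative markup inferred from the regex, not captured from a live page).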
    # Extract each thread's reply count from the thread-list HTML
    def getReply(self, page):
        pattern = re.compile(r'<span class="threadlist_rep_num center_text.*?>(.*?)</span>', re.S)
        result = re.findall(pattern, page)
        getreplies = []
        for item in result:
            getreply = "\n" + "Replies: " + self.tool.replace(item) + "\n"
            getreplies.append(getreply.encode('utf-8'))
        return getreplies

    # Extract thread authors from the thread-list HTML (unused)
    #def getAuthor(self, page):
    #    pattern = re.compile(r"<a data-field='.*? class=\"frs-author-name.*? >(.*?)</a>", re.S)
    #    result = re.findall(pattern, page)
    #    for item in result:
    #        print item

    # Extract thread ids from the thread-list HTML
    def getId(self, page):
        pattern = re.compile(r'<a href="/p/(.*?)" title=".*?" class="j_th_tit .*?>.*?</a>', re.S)
        result = re.findall(pattern, page)
        getids = []
        for item in result:
            getid = "\n" + "Thread id: " + self.tool.replace(item) + "\n"
            getids.append(getid.encode('utf-8'))
        return getids
    # Write one page's reply counts, thread ids and titles to the output file
    def writeData(self, getreplies, getids, gettitles):
        for (item, a, b) in zip(getreplies, getids, gettitles):
            self.file.write(item)
            self.file.write(a)
            self.file.write(b)
            c = "------------------------------------------------------------------------"
            self.file.write(c)
            self.floor += 1
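    # Each record written to the .txt file ends up looking roughly like this
    # (illustrative values):
    #
    #   Replies: 12
    #   Thread id: 1234567890
    #   Title: some thread title
    #   ------------------------------------------------------------------------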
    def start(self, title):
        indexPage = self.getPage(0)
        if indexPage is None:
            print "The URL is no longer valid, please try again"
            return
        self.file = open(title + ".txt", "a+")
        print "Output file opened successfully!"
        try:
            print "Fetching the first ten pages of the forum"
            for i in range(0, 10):
                print "Writing data for page " + str(i + 1)
                page = self.getPage(i * 50)
                getreplies = self.getReply(page)
                getids = self.getId(page)
                gettitles = self.getTitle(page)
                self.writeData(getreplies, getids, gettitles)
        except IOError, e:
            print "Write failed, reason: " + e.message
        finally:
            if self.file:
                self.file.close()
            print "Write task finished"
print u"请输入贴吧名:"
baseURL = 'http://tieba.baidu.com/f?kw=' + raw_input(u'') + '&fr=ala0&tpl=5'
print u"请再一次输入贴吧名称" #输入的名称为保存数据的txt文件名称
title = raw_input(u'')
#title = "贴吧主页"
bdtb = BDTB(baseURL)
bdtb.start(title)
Note: the formatting got mangled when this post was pasted, so copying the code directly may cause problems.
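For readers on Python 3, where urllib2 no longer exists, a minimal sketch of the same fetch-and-extract step could look like the code below. It reuses the regular expressions from the class above and assumes, untested, that the Tieba list page still serves the same markup; the forum name "python" in base_url is only an example.

import re
import urllib.request

def get_page(base_url, page_num):
    # Same URL scheme as getPage() above: pn selects the list page (0, 50, 100, ...)
    url = base_url + '&pn=' + str(page_num)
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')

base_url = 'http://tieba.baidu.com/f?kw=python&fr=ala0&tpl=5'
page = get_page(base_url, 0)
# Same patterns as getTitle(), getReply() and getId() above
titles = re.findall(r'<a href="/p/.*? title="(.*?)" .*? class="j_th_tit .*?>(.*?)</a>', page, re.S)
replies = re.findall(r'<span class="threadlist_rep_num center_text.*?>(.*?)</span>', page, re.S)
ids = re.findall(r'<a href="/p/(.*?)" title=".*?" class="j_th_tit .*?>.*?</a>', page, re.S)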