帖子检索

使用的库及其用途:
gevent:并发网络访问
BeautifulSoup:网页解析
re:关键字匹配、网站根路径获取
requests:HTTP 访问、代理支持

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#coding=utf8
from gevent import *
from gevent import monkey
monkey.patch_all()

import bs4
import requests
import re
import sys,os

import gevent.server
from gevent.local import local
import time
#import jieba
#jieba.initialize()#optional

# Captures the site root (optional scheme plus host) at the start of a URL.
# Both http:// and https:// are accepted, and nothing is required after the
# host, so a URL without a path keeps its complete host name.
PA = re.compile(r'((?:https?://)?[^/]+)')

# Page URL template; %d is replaced by the page number.
# Placeholder value - e.g. http://example.com/xx/page=%d
URL = 'xxxx'
# Site root, later prefixed to the relative links found on each page.
URL_ROOT = PA.search(URL).group(1)
# Placeholder keywords; the entries are joined with '|' and used as one
# regular expression against thread titles.
KEYWORD = (u'xxx', u'xxxx')

def cur_file_dir():
    """Return a base directory derived from ``sys.path[0]``.

    Python normally initialises ``sys.path[0]`` to the directory containing
    the script being run, so this is where the result file ends up.

    Returns:
        str: ``sys.path[0]`` itself when it is a directory, its parent
        directory when it is a file, and the current working directory
        otherwise (for example when ``sys.path[0]`` is the empty string).
        The last case used to fall through and return ``None``, which made
        the caller fail when building the output path.
    """
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    if os.path.isfile(path):
        return os.path.dirname(path)
    return os.getcwd()
print(cur_file_dir())

# Counters shared with the worker function:
#   n       - incremented before each request and decremented once it returns
#   alldone - incremented once for every finished doworn() call
n = alldone = 0

# Proxy settings handed to requests; this is the local goagent proxy.
proxies = dict(
    http='http://127.0.0.1:8087',
    https='http://127.0.0.1:8087',
)
def doworn(i, f):
    """Fetch list page ``i`` and record the threads whose title matches KEYWORD.

    Written for Python 2 (uses ``unicode`` and byte-string file writes).

    The page at ``URL % (i,)`` is requested through ``proxies`` without
    following redirects, decoded as GBK and parsed with BeautifulSoup.  For
    every ``<h3>`` whose first child has a title matching the KEYWORD regex,
    ``title,href`` is printed and ``title,URL_ROOT/href`` is appended to ``f``.

    Args:
        i: Page number substituted into ``URL``.
        f: Writable file object that receives one line per matching thread.

    Side effects:
        Maintains the module-level in-flight counter ``n`` and increments
        ``alldone`` exactly once per call, whether or not the page could be
        processed.  Exceptions are printed rather than propagated.
    """
    global n, alldone
    try:
        n += 1
        try:
            # NOTE(review): no timeout is passed, so a stalled connection
            # would keep this greenlet waiting indefinitely - confirm whether
            # a timeout should be added.
            r = requests.get(URL % (i,),
                             proxies=proxies,
                             allow_redirects=False)
        finally:
            # Always release the in-flight slot.  Leaving it held after a
            # failed request would eventually stall the throttle in dofun.
            n -= 1
    except Exception as e:
        print(e)
        alldone += 1
        return

    try:
        soup = bs4.BeautifulSoup(r.content.decode('gbk'))
        # The KEYWORD entries are alternatives of a single regex; build it
        # once per page instead of once per heading.
        keyword_re = re.compile(r'|'.join(KEYWORD))
        for t in soup.find_all('h3'):
            try:
                link = t.contents[0]
                title = unicode(link.string)
                if not keyword_re.search(title):
                    continue
                href = link['href']
            except Exception as e:
                # A heading without a usable link is skipped on its own
                # instead of aborting the remaining headings of the page.
                print(e)
                continue
            try:
                print('%s,%s' % (title, href))
                f.write('%s,%s/%s\n' % (title.encode('utf8'), URL_ROOT,
                                        href.encode('gbk')))
            except Exception as e:
                # Printing or encoding failed: still record the bare link,
                # newline-terminated so entries do not run together.
                print('%s %s' % (e, href))
                f.write('%s\n' % (href,))
    except Exception as e:
        print(e)
    alldone += 1


def _requests_in_flight():
    """Return the module-level in-flight request counter ``n``."""
    return n


def dofun(n, m, f):
    """Spawn one ``doworn`` greenlet for every page number from n to m.

    Args:
        n: First page number.
        m: Last page number (inclusive).
        f: File object handed to every worker.

    Spawning pauses while more than 10 requests are in flight, and the loop
    sleeps 0.01 s after each spawn.  The function returns once all workers
    have been spawned, not when they have finished.
    """
    for i in range(n, m + 1):
        # The parameter ``n`` shadows the module-level counter, so the
        # counter is read through a helper.  Testing the parameter itself
        # never throttled anything and looped forever for a start page > 10.
        while _requests_in_flight() > 10:
            sleep(0.1)
        spawn(doworn, i, f)
        sleep(0.01)
    print('all request been sended')

if __name__ == '__main__':
    # Results go to a timestamped file in the directory from cur_file_dir().
    t = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    filename = os.path.join(cur_file_dir(), 'result-%s.txt' % t)
    print(filename)
    ND = 100  # pages 1..ND are scanned (the original line was annotated "#75")

    # The with-block closes the result file even if the crawl raises or is
    # interrupted while waiting.
    with open(filename, 'w') as f:
        dofun(1, ND, f)
        # dofun returns as soon as the workers are spawned; every worker
        # increments alldone once when it finishes, so wait for all ND.
        while alldone < ND:
            sleep(1)