# NOTE(review): removed stray line-number residue left over from a copy/paste.
# Patch the stdlib for cooperative I/O before anything imports sockets.
from gevent import monkey
monkey.patch_all()

from gevent import *  # provides spawn() and sleep() used below
from gevent.local import local
import gevent.server

import os
import re
import sys
import time

import bs4
import requests
# Captures the scheme+host prefix of a URL in group(1); the trailing .+
# requires at least one more character after the host.
PA = re.compile('((?:http://)?[^/]+).+')

URL = 'xxxx'  # placeholder: search-result page template, filled via URL % (page,)
URL_ROOT = PA.search(URL).group(1)  # scheme+host, prepended to relative hrefs
KEYWORD = (u'xxx', u'xxxx')  # a title matching any of these patterns is recorded
def cur_file_dir():
    """Return the directory this script lives in.

    sys.path[0] is the script's directory under normal execution; if it is
    a file path instead, fall back to that file's dirname.

    Returns:
        The directory path, or None when sys.path[0] is neither an existing
        directory nor an existing file (unchanged from the original
        fall-through behavior).
    """
    path = sys.path[0]
    if os.path.isdir(path):
        return path
    elif os.path.isfile(path):
        return os.path.dirname(path)


# print() (not the Python-2 print statement) so the script runs on Python 3.
print(cur_file_dir())
# Shared counters mutated by the worker greenlets.
n = 0        # HTTP requests currently in flight
alldone = 0  # workers that have finished (success or failure)
# All HTTP traffic is routed through a local proxy (e.g. GoAgent on :8087).
proxies = {
    'http': 'http://127.0.0.1:8087',
    'https': 'http://127.0.0.1:8087',
}


def doworn(i, f):
    """Worker: fetch result page *i* and append keyword-matching <h3> links to *f*.

    Globals:
        n       -- in-flight request count, used by dofun() as a throttle signal.
        alldone -- bumped exactly once when this worker finishes (success or
                   failure) so the main loop can tell when every page is done.
    """
    global n, alldone
    n += 1
    try:
        r = requests.get(URL % (i,), proxies=proxies, allow_redirects=False)
    except Exception as e:
        print(e)
        alldone += 1
        return
    finally:
        # BUG FIX: the original decremented n only on the success path, so a
        # failed request permanently inflated the in-flight count.
        n -= 1
    try:
        # Pages are GBK-encoded; pin the parser so results are deterministic.
        soup = bs4.BeautifulSoup(r.content.decode('gbk'), 'html.parser')
        for h3 in soup.find_all('h3'):
            link = h3.contents[0]
            title = str(link.string)
            if re.search('|'.join(KEYWORD), title):
                try:
                    print('%s,%s' % (title, link['href']))
                    # Write consistently as text (the original mixed utf-8 and
                    # gbk byte strings in one line).
                    f.write('%s,%s/%s\n' % (title, URL_ROOT, link['href']))
                except Exception as e:
                    print(e, link['href'])
                    f.write(link['href'])
    except Exception as e:
        print(e)
    alldone += 1


def dofun(n, m, f):
    """Spawn one doworn() greenlet per page number in [n, m]."""
    workers = []
    for i in range(n, m + 1):
        # Throttle: wait while more than 10 requests are in flight.
        # BUG FIX: the original tested the *parameter* n (the start page),
        # which shadows the global counter, so no throttling ever happened.
        # globals() is used so the public signature stays unchanged.
        while globals()['n'] > 10:
            sleep(0.1)
        workers.append(spawn(doworn, i, f))
        sleep(0.01)
    print('all request been sended')
if __name__ == '__main__':
    # Timestamped output file next to the script,
    # e.g. result-2024-01-01 12-00-00.txt
    stamp = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    filename = cur_file_dir() + '/result-%s.txt' % stamp
    print(filename)
    ND = 100  # number of result pages to scan
    # BUG FIX: `with` guarantees the file is closed even if a worker or the
    # wait loop raises; the original leaked the handle on any exception.
    with open(filename, 'w') as f:
        dofun(1, ND, f)
        # Block until every worker has bumped `alldone` (success or failure).
        while alldone != ND:
            sleep(1)