爬虫篇2–另一版本获取迅雷账号
此版本引用了pycurl这个模块替代了urllib2,HTML解析器换成了lxml,效率上有一定提升~Download>>>
源码请点击更多~
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
#coding:utf8
#date:2016-06-15 23:39:03
#v2
#
# Scrapes the newest post of the xlfans.com "迅雷分享" category and prints the
# account/password pairs found in it.
#
# Python 2 script: page text is handled as UTF-8 encoded byte strings and the
# regular expressions below are byte patterns.
import io
import re
import sys

from bs4 import BeautifulSoup
import pycurl

# Category index page; its first <article> link points at the newest post.
url = 'http://xlfans.com/archives/category/迅雷分享'

# UTF-8 bytes of the full-width colon (U+FF1A), spelled as an escape so the
# value survives any re-encoding or normalisation of this source file.
_FULLWIDTH_COLON = u'\uff1a'.encode('utf-8')


class Thunder(object):
    """Holds one page (URL + downloaded HTML) or one obfuscated e-mail.

    Attributes are plain and may be reassigned by the caller between calls:
      url      -- address fetched by getHtml()
      htmlCode -- markup parsed by createSoup()
      email    -- hex string decoded by decodeEmail()
    """

    def __init__(self, url=None, htmlCode=None, email=None):
        self.url = url
        self.htmlCode = htmlCode
        self.email = email

    def getHtml(self):
        """Download self.url with pycurl and return the response body.

        Follows up to 3 redirects and uses a 10 second connect timeout.
        Prints a message and exits the process (status 1) when the final
        HTTP status is not 200.  The curl handle and the buffer are released
        on every path, including errors raised by perform().
        """
        curlObj = pycurl.Curl()
        bufferIO = io.BytesIO()
        try:
            curlObj.setopt(pycurl.URL, self.url)
            curlObj.setopt(pycurl.WRITEFUNCTION, bufferIO.write)
            curlObj.setopt(pycurl.FOLLOWLOCATION, 1)
            curlObj.setopt(pycurl.MAXREDIRS, 3)
            curlObj.setopt(pycurl.CONNECTTIMEOUT, 10)
            curlObj.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko')
            curlObj.perform()
            status = curlObj.getinfo(curlObj.HTTP_CODE)
            if status != 200:
                print('Bad code! %s exit...' % status)
                # Non-zero status so callers/shells can see the failure.
                sys.exit(1)
            return bufferIO.getvalue()
        finally:
            # Runs for normal return, SystemExit and pycurl errors alike.
            bufferIO.close()
            curlObj.close()

    def createSoup(self):
        """Parse self.htmlCode (treated as UTF-8) with the lxml parser."""
        return BeautifulSoup(self.htmlCode, 'lxml', from_encoding='utf-8')

    def decodeEmail(self):
        """Decode self.email, a hex string whose first byte is an XOR key.

        The first two hex digits give the key; every following pair of hex
        digits is XOR-ed with it to produce one character.  A trailing
        unpaired digit is ignored.
        """
        key = int(self.email[:2], 16)
        return ''.join(
            chr(int(self.email[i:i + 2], 16) ^ key)
            for i in range(2, len(self.email) - 1, 2)
        )


# Module-level accumulators filled by main():
#   L1/L2 -- account / password taken from lines without the word "email"
#   L3/L4 -- decoded e-mail accounts / password taken from "email" lines
L1 = []
L2 = []
L3 = []
L4 = []


def _splitFields(line):
    """Strip spaces from *line* and split it on colons (ASCII or full-width)."""
    # NOTE(review): the published source showed a no-op substitution of ':'
    # by ':' here; the first colon was presumably a full-width one that got
    # normalised when the post was rendered -- confirm against the site.
    compact = line.replace(' ', '').replace(_FULLWIDTH_COLON, ':')
    return compact.split(':')


def main():
    """Fetch the newest post, collect account/password pairs and print them."""
    # Start from empty accumulators so repeated calls do not pile up results.
    for bucket in (L1, L2, L3, L4):
        del bucket[:]

    index = Thunder(url)
    index.htmlCode = index.getHtml()
    realUrl = index.createSoup().article.a.get('href')
    index.url = realUrl
    index.htmlCode = index.getHtml()

    # Parse the post once and reuse the tree for text and e-mail anchors.
    soup = index.createSoup()
    childContent = soup.get_text().encode('utf-8')
    hasEmail = soup.find_all('a', class_='__cf_email__')

    # Accounts that are e-mail addresses are stored obfuscated in the
    # data-cfemail attribute (looks like Cloudflare e-mail protection).
    for x in hasEmail:
        L3.append(Thunder(email=x['data-cfemail']).decodeEmail())

    # NOTE(review): on byte strings the '?' binds only to the last UTF-8 byte
    # of '白', not to the whole character -- kept as published; verify intent.
    for i in re.findall(r'迅雷白?.*', childContent):
        fields = _splitFields(i)
        if re.search(r'email', i):
            # The account itself comes from L3; only the password is here.
            L4.append(fields[-1])
            continue
        if len(fields) < 2:
            # No colon on the line, so there is no account field to take.
            continue
        L1.append(re.sub(r'密(码)?', '', fields[1]))
        L2.append(fields[-1])

    D1 = dict(zip(L1, L2))
    D2 = dict(zip(L3, L4))
    D1.update(D2)
    for x in D1:
        print('账号:%-25s密码:%s' % (x, D1[x]))
    print('Total account: %s' % len(D1))


if __name__ == '__main__':
    main()
Hi, would you mind letting me know which hosting company you’re using?
I’ve loaded your blog in 3 different web browsers and I must say this blog loads
a lot quicker than most. Can you recommend a good web hosting
provider at a fair price? Thank you, I appreciate it!
Hi sir, it’s hostkvm, a Hong Kong provider. Here is the link: http://www.hostkvm.com/
This design is stellar! You definitely know how to
keep a reader amused. Between your wit and your videos, I was almost moved to start my own blog (well, almost…HaHa!) Wonderful job.
I really loved what you had to say, and more than that, how you presented it.
Too cool!
Magnificent beat! I wish to apprentice while you amend your website; how can I subscribe to a blog website?
The account helped me an acceptable deal. I had been a little bit acquainted with this; your broadcast offered a bright, clear concept