爬虫篇2–另一版本获取迅雷账号

作者：Hui
日期: 2016-06-15
分类: Python
浏览次数： 1,076

此版本使用 pycurl 模块替代了 urllib2，并将 HTML 解析器换成了 lxml，效率上有一定提升。Download>>>

源码请点击更多~

#coding:utf8
#date:2016-06-15 23:39:03
#v2
# Category listing page; main() downloads this first and follows the first
# article link found on it.
url = 'http://xlfans.com/archives/category/迅雷分享'

from bs4 import BeautifulSoup
import pycurl

class Thunder(object):
    """Helper bundling page download, HTML parsing and e-mail decoding.

    All attributes are plain and may be reassigned between calls:
      url      -- address fetched by getHtml()
      htmlCode -- raw HTML handed to createSoup()
      email    -- hex string (taken from a ``data-cfemail`` attribute by the
                  caller) decoded by decodeEmail()
    """

    def __init__(self, url=None, htmlCode=None, email=None):
        self.url = url
        self.htmlCode = htmlCode
        self.email = email

    def getHtml(self):
        """Download ``self.url`` with pycurl and return the raw response body.

        Follows at most 3 redirects, uses a 10 second connect timeout and a
        fixed browser User-Agent.

        Returns:
            The response body as a byte string.

        Raises:
            SystemExit: after printing a message, if the final HTTP status
                is not 200.
            pycurl.error: propagated from ``perform()`` on transfer failure.
        """
        # io.BytesIO exists on Python 2.6+ and 3.x and accepts the byte
        # chunks libcurl hands to WRITEFUNCTION.
        from io import BytesIO

        curlObj = pycurl.Curl()
        bufferIO = BytesIO()
        try:
            curlObj.setopt(pycurl.URL, self.url)
            curlObj.setopt(pycurl.WRITEFUNCTION, bufferIO.write)
            curlObj.setopt(pycurl.FOLLOWLOCATION, 1)
            curlObj.setopt(pycurl.MAXREDIRS, 3)
            curlObj.setopt(pycurl.CONNECTTIMEOUT, 10)
            curlObj.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko')
            curlObj.perform()
            status = curlObj.getinfo(pycurl.HTTP_CODE)
            if status != 200:
                print('Bad code! %s exit...' % status)
                # Same effect as the interactive exit() helper, but does not
                # depend on the site module having injected it.
                raise SystemExit
            return bufferIO.getvalue()
        finally:
            # Release the buffer and the curl handle on every path,
            # including transfer errors and the non-200 exit above.
            bufferIO.close()
            curlObj.close()

    def createSoup(self):
        """Parse ``self.htmlCode`` as UTF-8 with the lxml parser.

        Returns:
            A new BeautifulSoup object; nothing is cached on the instance.
        """
        return BeautifulSoup(self.htmlCode, 'lxml', from_encoding='utf-8')

    def decodeEmail(self):
        """Decode the XOR-obfuscated hex string stored in ``self.email``.

        The first two hex digits are the key; every following pair of hex
        digits is XORed with that key to yield one character.

        Returns:
            The decoded string ('' when only the key is present).

        Raises:
            ValueError: if a pair is not valid hexadecimal.
        """
        encoded = self.email
        key = int(encoded[:2], 16)
        # Stopping at len - 1 skips a dangling single digit when the input
        # has odd length; for even lengths every pair is visited.
        return ''.join(
            chr(int(encoded[pos:pos + 2], 16) ^ key)
            for pos in range(2, len(encoded) - 1, 2)
        )

# Module-level accumulators that main() appends to:
#   L1 / L2 -- account names / passwords parsed from plain-text lines
#   L3 / L4 -- decoded e-mail accounts / passwords from lines containing 'email'
L1, L2, L3, L4 = [], [], [], []
def main():
    """Fetch the newest shared post and print the account/password pairs.

    Steps:
      1. Download the listing page at the module-level ``url``.
      2. Follow the first link inside the first <article> element.
      3. Collect decoded ``data-cfemail`` values from ``a.__cf_email__`` tags.
      4. Scan the page text for matching lines and split each on ':' to get
         an account and a password.
      5. Print every pair and the total count.

    Results are appended to the module-level lists L1-L4, so calling this
    function more than once accumulates entries from every call.
    """
    import re

    index = Thunder(url)
    index.htmlCode = index.getHtml()
    realUrl = index.createSoup().article.a.get('href')
    index.url = realUrl
    index.htmlCode = index.getHtml()

    # Parse the post once and reuse the tree for both text and tag lookups.
    soup = index.createSoup()
    childContent = soup.get_text().encode('utf-8')
    hasEmail = soup.find_all('a', class_='__cf_email__')

    for x in hasEmail:
        L3.append(Thunder(email=x['data-cfemail']).decodeEmail())

    # NOTE(review): on Python 2 this pattern is a byte string matched against
    # UTF-8 bytes, so '?' applies only to the last byte of the preceding
    # multi-byte character rather than the whole character -- confirm that
    # is acceptable before changing it.
    for i in re.findall(r'迅雷白?.*', childContent):
        # Drop spaces, unify full-width colons, then split into fields.
        fields = i.replace(' ', '').replace('：', ':').split(':')
        if re.search(r'email', i):
            # Account is an obfuscated e-mail (decoded into L3 above); only
            # the password is taken from the text line.
            L4.append(fields[-1])
            continue
        if len(fields) < 2:
            # No colon on this line, so there is no account field to read.
            continue
        L1.append(re.sub(r'密(码)?', '', fields[1]))
        L2.append(fields[-1])

    D1 = dict(zip(L1, L2))
    D2 = dict(zip(L3, L4))
    D1.update(D2)
    for x in D1:
        print('账号:%-25s密码:%s' % (x, D1[x]))
    print('Total account: %s' % len(D1))

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()

#coding:utf8

#date:2016-06-15 23:39:03

#v2

# Category listing page; main() downloads this first and follows the first
# article link found on it.
url = 'http://xlfans.com/archives/category/迅雷分享'

from bs4 import BeautifulSoup

import pycurl

class Thunder(object):
    """Helper bundling page download, HTML parsing and e-mail decoding.

    All attributes are plain and may be reassigned between calls:
      url      -- address fetched by getHtml()
      htmlCode -- raw HTML handed to createSoup()
      email    -- hex string (taken from a ``data-cfemail`` attribute by the
                  caller) decoded by decodeEmail()
    """

    def __init__(self, url=None, htmlCode=None, email=None):
        self.url = url
        self.htmlCode = htmlCode
        self.email = email

    def getHtml(self):
        """Download ``self.url`` with pycurl and return the raw response body.

        Follows at most 3 redirects, uses a 10 second connect timeout and a
        fixed browser User-Agent.

        Returns:
            The response body as a byte string.

        Raises:
            SystemExit: after printing a message, if the final HTTP status
                is not 200.
            pycurl.error: propagated from ``perform()`` on transfer failure.
        """
        # io.BytesIO exists on Python 2.6+ and 3.x and accepts the byte
        # chunks libcurl hands to WRITEFUNCTION.
        from io import BytesIO

        curlObj = pycurl.Curl()
        bufferIO = BytesIO()
        try:
            curlObj.setopt(pycurl.URL, self.url)
            curlObj.setopt(pycurl.WRITEFUNCTION, bufferIO.write)
            curlObj.setopt(pycurl.FOLLOWLOCATION, 1)
            curlObj.setopt(pycurl.MAXREDIRS, 3)
            curlObj.setopt(pycurl.CONNECTTIMEOUT, 10)
            curlObj.setopt(pycurl.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko')
            curlObj.perform()
            status = curlObj.getinfo(pycurl.HTTP_CODE)
            if status != 200:
                print('Bad code! %s exit...' % status)
                # Same effect as the interactive exit() helper, but does not
                # depend on the site module having injected it.
                raise SystemExit
            return bufferIO.getvalue()
        finally:
            # Release the buffer and the curl handle on every path,
            # including transfer errors and the non-200 exit above.
            bufferIO.close()
            curlObj.close()

    def createSoup(self):
        """Parse ``self.htmlCode`` as UTF-8 with the lxml parser.

        Returns:
            A new BeautifulSoup object; nothing is cached on the instance.
        """
        return BeautifulSoup(self.htmlCode, 'lxml', from_encoding='utf-8')

    def decodeEmail(self):
        """Decode the XOR-obfuscated hex string stored in ``self.email``.

        The first two hex digits are the key; every following pair of hex
        digits is XORed with that key to yield one character.

        Returns:
            The decoded string ('' when only the key is present).

        Raises:
            ValueError: if a pair is not valid hexadecimal.
        """
        encoded = self.email
        key = int(encoded[:2], 16)
        # Stopping at len - 1 skips a dangling single digit when the input
        # has odd length; for even lengths every pair is visited.
        return ''.join(
            chr(int(encoded[pos:pos + 2], 16) ^ key)
            for pos in range(2, len(encoded) - 1, 2)
        )

# Module-level accumulators that main() appends to:
#   L1 / L2 -- account names / passwords parsed from plain-text lines
#   L3 / L4 -- decoded e-mail accounts / passwords from lines containing 'email'
L1, L2, L3, L4 = [], [], [], []

def main():
    """Fetch the newest shared post and print the account/password pairs.

    Steps:
      1. Download the listing page at the module-level ``url``.
      2. Follow the first link inside the first <article> element.
      3. Collect decoded ``data-cfemail`` values from ``a.__cf_email__`` tags.
      4. Scan the page text for matching lines and split each on ':' to get
         an account and a password.
      5. Print every pair and the total count.

    Results are appended to the module-level lists L1-L4, so calling this
    function more than once accumulates entries from every call.
    """
    import re

    index = Thunder(url)
    index.htmlCode = index.getHtml()
    realUrl = index.createSoup().article.a.get('href')
    index.url = realUrl
    index.htmlCode = index.getHtml()

    # Parse the post once and reuse the tree for both text and tag lookups.
    soup = index.createSoup()
    childContent = soup.get_text().encode('utf-8')
    hasEmail = soup.find_all('a', class_='__cf_email__')

    for x in hasEmail:
        L3.append(Thunder(email=x['data-cfemail']).decodeEmail())

    # NOTE(review): on Python 2 this pattern is a byte string matched against
    # UTF-8 bytes, so '?' applies only to the last byte of the preceding
    # multi-byte character rather than the whole character -- confirm that
    # is acceptable before changing it.
    for i in re.findall(r'迅雷白?.*', childContent):
        # Drop spaces, unify full-width colons, then split into fields.
        fields = i.replace(' ', '').replace('：', ':').split(':')
        if re.search(r'email', i):
            # Account is an obfuscated e-mail (decoded into L3 above); only
            # the password is taken from the text line.
            L4.append(fields[-1])
            continue
        if len(fields) < 2:
            # No colon on this line, so there is no account field to read.
            continue
        L1.append(re.sub(r'密(码)?', '', fields[1]))
        L2.append(fields[-1])

    D1 = dict(zip(L1, L2))
    D2 = dict(zip(L3, L4))
    D1.update(D2)
    for x in D1:
        print('账号:%-25s密码:%s' % (x, D1[x]))
    print('Total account: %s' % len(D1))

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()

Tags: python

游客

website traffic service

Hi would you mind letting me know which hosting company you’re using?

I’ve loaded your blog in 3 different web browsers and I must say this blog loads
a lot quicker than most. Can you recommend a good web hosting
provider at a fair price? Thank you, I appreciate it!

作者

Hui

Hi sir, it’s hostkvm, a Hong Kong provider. Here is the link: http://www.hostkvm.com/

Leif

This design is stellar! You definitely know how to
keep a reader amused. Between your wit and your videos, I was almost moved to start my own blog (well, almost…HaHa!) Wonderful job.
I really loved what you had to say, and more than that, how you presented it.
Too cool!

Lenovo YT3-X50M/Yoga3 Tablet

Magnificent beat! I wish to apprentice while you amend your website; how can I subscribe for a blog website?

The account helped me a acceptable deal. I had been a little bit acquainted of this your broadcast offered bright clear concept

爬虫篇2–另一版本获取迅雷账号

热门文章

近期文章

文章归档

爬虫篇2–另一版本获取迅雷账号

热门文章

近期文章

文章归档

标签